# pip install faster-whisper
from api import API
import os
import wave

import pyaudio
from faster_whisper import WhisperModel


def transcribe_chunk(model, file_path):
    """Transcribe the audio file at *file_path* with *model*.

    Parameters:
        model: a faster-whisper ``WhisperModel`` (or any object with a
            compatible ``transcribe(path)`` returning ``(segments, info)``).
        file_path: path to the WAV file to transcribe.

    Returns:
        The concatenated text of all segments, space-separated and stripped.
    """
    print("transcribing...")
    segments, _ = model.transcribe(file_path)
    # Join every segment's text into one space-separated string.
    transcription = ""
    for segment in segments:
        transcription += segment.text + " "
    return transcription.strip()


def record_chunk(p, stream, file_path, chunk_length=2):
    """Record *chunk_length* seconds of 16 kHz mono audio to *file_path*.

    Parameters:
        p: the ``pyaudio.PyAudio`` instance (used for sample-width lookup).
        stream: an open PyAudio input stream at 16 kHz, 1024-frame buffers.
        file_path: destination WAV file path.
        chunk_length: recording duration in seconds (default 2).
    """
    frames = []
    # 16000 samples/s read in 1024-frame buffers -> number of reads needed.
    for _ in range(0, int(16000 / 1024 * chunk_length)):
        frames.append(stream.read(1024))
    wf = wave.open(file_path, 'wb')
    try:
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(16000)
        wf.writeframes(b''.join(frames))
    finally:
        # Close the WAV handle even if a set*/write call raises.
        wf.close()


def listen():
    """Record one chunk from the microphone and return its transcription.

    Returns:
        The transcribed text, or ``"ERROR"`` if recording or transcription
        fails (preserving the old ``Voice.listen`` error-sentinel contract).

    Note: the original version placed ``return transcription`` inside a
    ``try`` whose ``finally`` executed ``break`` — a ``break`` in ``finally``
    discards the in-flight ``return``, so the function always returned
    ``None``. The loop also ran exactly once, so it is removed entirely and
    the transcription is returned *after* cleanup instead.
    """
    # Model size options: tiny, base, small, medium, large.
    model_size = "medium"
    # Run on CPU; use device="cuda" for GPU.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)

    # TODO: use tempfile.NamedTemporaryFile instead of a fixed relative path.
    chunk_file = "temp_chunk.wav"
    transcription = "ERROR"
    try:
        print("Recording...")
        record_chunk(p, stream, chunk_file)
        transcription = transcribe_chunk(model, chunk_file)
        print(transcription)
    except KeyboardInterrupt:
        print("Stopping...")
    except Exception:
        # Best-effort: keep the "ERROR" sentinel rather than propagating,
        # matching the behavior the pre-diff Voice.listen() promised.
        transcription = "ERROR"
    finally:
        # Always delete the temp chunk and release the audio device.
        if os.path.exists(chunk_file):
            os.remove(chunk_file)
        stream.stop_stream()
        stream.close()
        p.terminate()
    return transcription