class VoiceRecognition:
    """Speech-to-text helper that transcribes OGG audio with a Whisper model."""

    @staticmethod
    def recognition(audio) -> str:
        """Transcribe an OGG audio stream and return the recognised text.

        Args:
            audio: A readable binary object; its ``read()`` result is treated
                as OGG-encoded audio data.

        Returns:
            The text of every transcribed segment, separated by single
            spaces, with leading and trailing whitespace removed. An empty
            string is returned when no segments are produced.
        """
        # Decode the OGG payload from an in-memory buffer.
        ogg_buffer = io.BytesIO(audio.read())
        audio_segment = AudioSegment.from_file(ogg_buffer, format="ogg")

        # Re-encode as WAV in memory, then rewind so the model reads the
        # buffer from its first byte.
        wav_io = io.BytesIO()
        audio_segment.export(wav_io, format="wav")
        wav_io.seek(0)

        # NOTE(review): the model is constructed on every call; if this
        # method is called frequently, consider reusing a single instance.
        model_size = "base"
        model = WhisperModel(model_size, device="cpu", compute_type="int8")

        # The second element returned by transcribe() is not needed here.
        segments, _ = model.transcribe(wav_io)

        # join() avoids repeated string concatenation; the final strip()
        # yields the same result as appending "<text> " per segment.
        return " ".join(segment.text for segment in segments).strip()