diff --git a/app/backend/InputOutputHandler.tsx b/app/backend/InputOutputHandler.tsx
index 381ef88..36b3fac 100644
--- a/app/backend/InputOutputHandler.tsx
+++ b/app/backend/InputOutputHandler.tsx
@@ -151,7 +151,7 @@ const InputOutputBackend: React.FC = () => {
         }
 
         mediaRecorder.onstop = () => {
-            const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" })
+            const audioBlob = new Blob(audioChunks.current, { type: "audio/wav" })
             const url = URL.createObjectURL(audioBlob)
             console.log(url);
             setAudioURL(url)
diff --git a/app/backend/voice_backend.ts b/app/backend/voice_backend.ts
index 3c4193b..a93fd89 100644
--- a/app/backend/voice_backend.ts
+++ b/app/backend/voice_backend.ts
@@ -4,9 +4,13 @@ import axios from "axios";
 
 class VoiceSend {
     sendToVoiceRecognition(audio_data: Blob) {
         console.log("sending recording...");
+        console.log(typeof (audio_data));
+        console.log(audio_data instanceof Blob);
         const formdata = new FormData()
         formdata.append("audio", audio_data)
+        formdata.append("option", "offline")
+        formdata.append("type", "basic")
 
         const dataSend = { option:"offline", type:"basic",audio:audio_data }
         axios.post("http://localhost:5000/interstellar_ai/api/voice_recognition", formdata)
diff --git a/py/requirements.txt b/py/requirements.txt
index 212a41d..7ea1a85 100644
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -14,5 +14,4 @@ pycouchdb
 pyttsx3
 pip-licenses
 openai-whisper
-pydub
-ffmpeg
\ No newline at end of file
+pydub
\ No newline at end of file
diff --git a/py/voice.py b/py/voice.py
index cd0d657..b4516f0 100644
--- a/py/voice.py
+++ b/py/voice.py
@@ -1,24 +1,10 @@
-import io
-import numpy as np
 import whisper
-from pydub import AudioSegment
 
 
 class VoiceRecognition:
     @staticmethod
     def recognition(audio):
-        audio_data = audio.read()
-        with io.BytesIO(audio_data) as audio_buffer:
-            audio_segment = AudioSegment.from_ogg(audio_buffer)
-
-            raw_data = np.array(audio_segment.get_array_of_samples())
-
-            if audio_segment.channels > 1:
-                raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]
-
-            audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))
-
         model = whisper.load_model("base")
-        result = model.transcribe(audio_data)
+        result = model.transcribe(audio)
         print(result["text"])
         return result["text"]
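
For context, here is a minimal sketch (not part of this diff) of one way the backend could hand the uploaded audio to Whisper. It assumes the Flask route passes a file-like object such as a werkzeug `FileStorage` from `request.files["audio"]`; since `transcribe()` accepts a file path, NumPy array, or tensor rather than a file-like object, the sketch writes the upload to a temporary file first.

```python
# Hypothetical sketch, not part of this diff: write the uploaded blob to a
# temporary file and pass the path to Whisper. Decoding a path requires the
# ffmpeg binary to be available on the system.
import os
import tempfile

import whisper


def transcribe_upload(audio):
    # `audio` is assumed to be a file-like object, e.g. a werkzeug FileStorage
    # taken from request.files["audio"] in the Flask route.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
        path = tmp.name
    try:
        model = whisper.load_model("base")
        result = model.transcribe(path)
        return result["text"]
    finally:
        os.remove(path)
```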