From 1752a5745f959ca2619f102de65ab51522b316d0 Mon Sep 17 00:00:00 2001
From: YasinOnm08
Date: Thu, 26 Sep 2024 13:08:29 +0200
Subject: [PATCH] voice recognition trial 5

---
 app/backend/InputOutputHandler.tsx |  2 +-
 app/backend/voice_backend.ts       |  4 ----
 py/requirements.txt                |  3 ++-
 py/voice.py                        | 16 +++++++++++++++-
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/app/backend/InputOutputHandler.tsx b/app/backend/InputOutputHandler.tsx
index 36b3fac..381ef88 100644
--- a/app/backend/InputOutputHandler.tsx
+++ b/app/backend/InputOutputHandler.tsx
@@ -151,7 +151,7 @@ const InputOutputBackend: React.FC = () => {
         }
 
         mediaRecorder.onstop = () => {
-            const audioBlob = new Blob(audioChunks.current, { type: "audio/wav" })
+            const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" })
             const url = URL.createObjectURL(audioBlob)
             console.log(url);
             setAudioURL(url)
diff --git a/app/backend/voice_backend.ts b/app/backend/voice_backend.ts
index a93fd89..3c4193b 100644
--- a/app/backend/voice_backend.ts
+++ b/app/backend/voice_backend.ts
@@ -4,13 +4,9 @@ import axios from "axios";
 
 class VoiceSend {
     sendToVoiceRecognition(audio_data: Blob) {
         console.log("sending recording...");
-        console.log(typeof (audio_data));
-        console.log(audio_data instanceof Blob);
         const formdata = new FormData()
         formdata.append("audio", audio_data)
-        formdata.append("option", "offline")
-        formdata.append("type", "basic")
 
         const dataSend = { option:"offline", type:"basic",audio:audio_data }
         axios.post("http://localhost:5000/interstellar_ai/api/voice_recognition", formdata)
diff --git a/py/requirements.txt b/py/requirements.txt
index 7ea1a85..212a41d 100644
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -14,4 +14,5 @@ pycouchdb
 pyttsx3
 pip-licenses
 openai-whisper
-pydub
\ No newline at end of file
+pydub
+ffmpeg
\ No newline at end of file
diff --git a/py/voice.py b/py/voice.py
index b4516f0..cd0d657 100644
--- a/py/voice.py
+++ b/py/voice.py
@@ -1,10 +1,24 @@
+import io
+import numpy as np
 import whisper
+from pydub import AudioSegment
 
 
 class VoiceRecognition:
     @staticmethod
     def recognition(audio):
+        audio_data = audio.read()
+        with io.BytesIO(audio_data) as audio_buffer:
+            audio_segment = AudioSegment.from_ogg(audio_buffer)
+
+        raw_data = np.array(audio_segment.get_array_of_samples())
+
+        if audio_segment.channels > 1:
+            raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]
+
+        audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))
+
         model = whisper.load_model("base")
-        result = model.transcribe(audio)
+        result = model.transcribe(audio_data)
         print(result["text"])
         return result["text"]
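
Note: when model.transcribe() is handed a raw NumPy array instead of a file
path, Whisper interprets the samples as 16 kHz mono, while browser OGG/Opus
recordings are typically captured at 48 kHz, so the decoded audio would be
read at the wrong rate. A minimal sketch of the missing resampling step,
reusing the names from the py/voice.py hunk above (the 16000 target matches
whisper.audio.SAMPLE_RATE; set_frame_rate() and set_channels() are standard
pydub AudioSegment methods):

    # Resample to Whisper's expected 16 kHz mono before the float32 conversion.
    # Downmixing via set_channels(1) would also make the manual channel-picking
    # branch in the hunk above unnecessary.
    audio_segment = audio_segment.set_frame_rate(16000).set_channels(1)
    raw_data = np.array(audio_segment.get_array_of_samples())
    audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))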
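
Note: the ffmpeg entry added to py/requirements.txt is the PyPI package of
that name, which does not ship the ffmpeg binary itself; pydub's
AudioSegment.from_ogg() shells out to a real ffmpeg/avconv executable, so
that binary has to be present on the system (e.g. installed through the OS
package manager) for the decode step to work.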