From 8090ce969e00d3769cadcc77d58f5fac0328d7b8 Mon Sep 17 00:00:00 2001
From: YasinOnm08
Date: Fri, 27 Sep 2024 13:59:27 +0200
Subject: [PATCH] voice recognition kinda works??

---
 app/backend/InputOutputHandler.tsx | 23 +++++++------------
 py/api.py                          |  2 +-
 py/voice.py                        | 36 +++++++++++++++++-------------
 3 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/app/backend/InputOutputHandler.tsx b/app/backend/InputOutputHandler.tsx
index fa8dcf6..83c998a 100644
--- a/app/backend/InputOutputHandler.tsx
+++ b/app/backend/InputOutputHandler.tsx
@@ -167,16 +167,14 @@ const InputOutputBackend: React.FC = () => {
 
         mediaRecorder.onstop = async () => {
             const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" })
-            console.log(audioBlob);
-            const url = URL.createObjectURL(audioBlob)
-            const audio = new Audio(url);
-            audio.play().catch(error => console.error("Error playing audio:", error));
-            console.log(url);
-            setAudioURL(url)
             audioChunks.current = []
-            const wavBlob = await convertOggToWav(audioBlob)
+            // console.log(audioBlob);
+            // const url = URL.createObjectURL(audioBlob)
+            // const audio = new Audio(url);
+            // audio.play().catch(error => console.error("Error playing audio:", error));
+
             const remote = new VoiceSend()
-            remote.sendToVoiceRecognition(wavBlob)
+            remote.sendToVoiceRecognition(audioBlob)
         }
 
         mediaRecorder.start()
@@ -276,14 +274,9 @@ const InputOutputBackend: React.FC = () => {
             onMicClick={handleMicClick}
             inputDisabled={inputDisabled}
             isRecording={isRecording}
-            />
+        />
     )
 }
 
 
-export default InputOutputBackend
-
-
-
-
-
+export default InputOutputBackend
\ No newline at end of file
diff --git a/py/api.py b/py/api.py
index c43bf1f..f3e13ea 100644
--- a/py/api.py
+++ b/py/api.py
@@ -1,4 +1,4 @@
-from flask i mport Flask, request, jsonify
+from flask import Flask, request, jsonify
 from flask_cors import CORS
 import secrets
 import threading
diff --git a/py/voice.py b/py/voice.py
index 461da21..dc0d28b 100644
--- a/py/voice.py
+++ b/py/voice.py
@@ -1,26 +1,32 @@
 import io
 import numpy as np
-import whisper
+from faster_whisper import WhisperModel
 from pydub import AudioSegment
 
 class VoiceRecognition:
     @staticmethod
     def recognition(audio):
-        audio_data = audio.read()
-        with io.BytesIO(audio_data) as audio_buffer:
-            audio_segment = AudioSegment.from_ogg(audio_buffer)
-
-            raw_data = np.array(audio_segment.get_array_of_samples())
-
-            if audio_segment.channels > 1:
-                raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]
-
-            audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))
+        audio_buffer = io.BytesIO(audio.read())
 
-        model = whisper.load_model("base")
-        result = model.transcribe(audio_data)
-        print(result["text"])
-        return result["text"]
+        try:
+            audio_segment = AudioSegment.from_file(audio_buffer, format="ogg")
+
+            wav_io = io.BytesIO()
+            audio_segment.export(wav_io, format="wav")
+            wav_io.seek(0)
+        except:
+            print("audio to wav failed")
+
+        model_size = "base"
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+        segments, _ = model.transcribe(wav_io)
+        transcription = ""
+        for segment in segments:
+            transcription += segment.text + " "
+        result = transcription.strip()
+        print(result)
+        return result
 
 
 # npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt
\ No newline at end of file
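
For reference, a minimal sketch (not part of this patch) of how the Flask side could consume the updated VoiceRecognition.recognition() now that the frontend posts the raw audio/ogg blob and the Ogg-to-WAV conversion happens server-side. The route name "/voice_recognition", the form field "audio", the import path, and the JSON response shape are assumptions, since api.py's endpoints are not shown in this diff.

from flask import Flask, request, jsonify
from flask_cors import CORS

from voice import VoiceRecognition  # assumes api.py and voice.py live in the same package/dir

app = Flask(__name__)
CORS(app)

# Hypothetical endpoint: accepts the Ogg blob uploaded by sendToVoiceRecognition()
# and lets VoiceRecognition handle conversion and transcription via faster-whisper.
@app.route("/voice_recognition", methods=["POST"])
def voice_recognition():
    audio_file = request.files.get("audio")  # field name is an assumption
    if audio_file is None:
        return jsonify({"error": "no audio file provided"}), 400
    text = VoiceRecognition.recognition(audio_file)  # FileStorage exposes .read(), as recognition() expects
    return jsonify({"response": text})

if __name__ == "__main__":
    app.run(port=5000)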