forked from React-Group/interstellar_ai
voice recognition kinda works??
This commit is contained in:
parent 233a173697
commit 8090ce969e
3 changed files with 30 additions and 31 deletions
@@ -167,16 +167,14 @@ const InputOutputBackend: React.FC = () => {
 
     mediaRecorder.onstop = async () => {
       const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" })
-      console.log(audioBlob);
-      const url = URL.createObjectURL(audioBlob)
-      const audio = new Audio(url);
-      audio.play().catch(error => console.error("Error playing audio:", error));
-      console.log(url);
-      setAudioURL(url)
       audioChunks.current = []
-      const wavBlob = await convertOggToWav(audioBlob)
+      // console.log(audioBlob);
+      // const url = URL.createObjectURL(audioBlob)
+      // const audio = new Audio(url);
+      // audio.play().catch(error => console.error("Error playing audio:", error));
 
       const remote = new VoiceSend()
-      remote.sendToVoiceRecognition(wavBlob)
+      remote.sendToVoiceRecognition(audioBlob)
     }
 
     mediaRecorder.start()
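With this hunk the browser no longer converts the recording; it posts the raw ogg blob and the conversion happens server-side in py/voice.py (last diff below). For orientation, here is a minimal sketch of a Flask route that could receive that upload and hand it to the new recognizer. The route path, form field name, and module layout are assumptions for illustration, not taken from this commit; only the Flask imports and VoiceRecognition.recognition() appear in the diffs.

# Hypothetical receiving endpoint -- "/voice_recognition" and the "audio"
# field name are assumptions, not part of this commit.
from flask import Flask, request, jsonify
from voice import VoiceRecognition

app = Flask(__name__)

@app.route("/voice_recognition", methods=["POST"])
def voice_recognition():
    audio = request.files["audio"]  # werkzeug FileStorage; supports .read()
    text = VoiceRecognition.recognition(audio)
    return jsonify({"response": text})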
@@ -282,8 +280,3 @@ const InputOutputBackend: React.FC = () => {
 }
 
 export default InputOutputBackend
-
-
-
-
-
@@ -1,4 +1,4 @@
-from flask i mport Flask, request, jsonify
+from flask import Flask, request, jsonify
 from flask_cors import CORS
 import secrets
 import threading
py/voice.py (30 changed lines)
@@ -1,26 +1,32 @@
 import io
 import numpy as np
-import whisper
+from faster_whisper import WhisperModel
 from pydub import AudioSegment
 
 
 class VoiceRecognition:
     @staticmethod
     def recognition(audio):
-        audio_data = audio.read()
-        with io.BytesIO(audio_data) as audio_buffer:
-            audio_segment = AudioSegment.from_ogg(audio_buffer)
+        audio_buffer = io.BytesIO(audio.read())
 
-            raw_data = np.array(audio_segment.get_array_of_samples())
+        try:
+            audio_segment = AudioSegment.from_file(audio_buffer, format="ogg")
 
-            if audio_segment.channels > 1:
-                raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]
+            wav_io = io.BytesIO()
+            audio_segment.export(wav_io, format="wav")
+            wav_io.seek(0)
+        except:
+            print("audio to wav failed")
 
-            audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))
+        model_size = "base"
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
-            model = whisper.load_model("base")
-            result = model.transcribe(audio_data)
-            print(result["text"])
-            return result["text"]
+        segments, _ = model.transcribe(wav_io)
+        transcription = ""
+        for segment in segments:
+            transcription += segment.text + " "
+        result = transcription.strip()
+        print(result)
+        return result
 
 
 # npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt
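A quick way to smoke-test the new faster-whisper path outside the browser, assuming ffmpeg is installed (pydub shells out to it for the ogg decode) and a sample.ogg recording is on hand; the file name and module path are placeholders:

# Hypothetical smoke test -- sample.ogg stands in for a real recording.
from voice import VoiceRecognition

with open("sample.ogg", "rb") as f:
    print(VoiceRecognition.recognition(f))

faster_whisper's transcribe() accepts a file-like object directly, and compute_type="int8" runs the CTranslate2 backend quantized on CPU, which is why the old numpy sample-conversion code could be dropped.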