interstellar_ai/py/voice.py

31 lines
917 B
Python
Raw Normal View History

2024-09-26 13:08:29 +02:00
import io
2024-09-27 13:59:27 +02:00
from faster_whisper import WhisperModel
2024-09-26 13:08:29 +02:00
from pydub import AudioSegment
2024-09-24 09:55:23 +02:00
class VoiceRecognition:
    """Speech-to-text helper that transcribes Ogg audio with faster-whisper."""

    # Whisper checkpoint name handed to WhisperModel.
    _MODEL_SIZE = "base"
    # Lazily created WhisperModel shared by all calls; building the model is
    # expensive, so it is done once instead of on every request.
    _model = None

    @staticmethod
    def _get_model():
        """Return the shared WhisperModel, creating it on first use."""
        if VoiceRecognition._model is None:
            VoiceRecognition._model = WhisperModel(
                VoiceRecognition._MODEL_SIZE,
                device="cpu",
                # The previous value " " is not a valid compute type and makes
                # model construction fail; "int8" is the CPU setting used in
                # the faster-whisper documentation.
                compute_type="int8",
            )
        return VoiceRecognition._model

    @staticmethod
    def recognition(audio):
        """Transcribe an Ogg audio stream to text.

        Args:
            audio: Binary file-like object (anything exposing ``read()``)
                whose content is decoded as Ogg audio.

        Returns:
            The text of all transcribed segments joined by single spaces,
            with leading and trailing whitespace removed.

        Raises:
            ValueError: If the audio cannot be decoded or converted to WAV.
        """
        audio_buffer = io.BytesIO(audio.read())
        wav_io = io.BytesIO()
        try:
            audio_segment = AudioSegment.from_file(audio_buffer, format="ogg")
            audio_segment.export(wav_io, format="wav")
        except Exception as exc:
            # Fail here with the real cause attached instead of printing and
            # then crashing later on an unbound ``wav_io``.
            raise ValueError("audio to wav failed") from exc
        # Rewind so the transcriber reads the WAV data from the beginning.
        wav_io.seek(0)

        segments, _ = VoiceRecognition._get_model().transcribe(wav_io)
        result = " ".join(segment.text for segment in segments).strip()
        print(result)
        return result
2024-09-26 16:01:40 +02:00
# npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt