interstellar_ai/py/voice.py

24 lines
787 B
Python

import io
import numpy as np
import whisper
from pydub import AudioSegment
class VoiceRecognition:
    """Speech-to-text helper that transcribes Ogg audio with Whisper."""

    # Whisper interprets raw waveform arrays as being sampled at 16 kHz.
    _WHISPER_SAMPLE_RATE = 16000

    # Whisper model shared by all calls; loaded on first use because
    # loading the weights is far more expensive than a single transcription
    # set-up and does not depend on the input.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared Whisper "base" model, loading it on first use."""
        if cls._model is None:
            cls._model = whisper.load_model("base")
        return cls._model

    @staticmethod
    def recognition(audio):
        """Transcribe an Ogg recording and return the recognised text.

        Args:
            audio: Object with a ``read()`` method returning the bytes of
                an Ogg-encoded recording.

        Returns:
            The ``"text"`` entry of Whisper's transcription result, or an
            empty string when the input holds no audio data.

        The transcribed text is also printed to stdout.
        """
        encoded = audio.read()
        if not encoded:
            # Nothing to decode: skip the decoder and the model entirely.
            return ""

        with io.BytesIO(encoded) as audio_buffer:
            audio_segment = AudioSegment.from_ogg(audio_buffer)

        # Resample to the rate Whisper assumes for array input; without
        # this, recordings at any other rate are misinterpreted.
        audio_segment = audio_segment.set_frame_rate(
            VoiceRecognition._WHISPER_SAMPLE_RATE
        )

        samples = np.array(audio_segment.get_array_of_samples())
        if samples.size == 0:
            return ""

        if audio_segment.channels > 1:
            # Samples are interleaved per frame; keep only the first channel.
            samples = samples.reshape((-1, audio_segment.channels))[:, 0]

        # Scale signed integer PCM to float32 in [-1.0, 1.0).
        full_scale = 2 ** (audio_segment.sample_width * 8 - 1)
        waveform = samples.astype(np.float32) / full_scale

        result = VoiceRecognition._get_model().transcribe(waveform)
        text = result["text"]
        print(text)
        return text