2024-09-26 13:08:29 +02:00
|
|
|
import io
|
|
|
|
import numpy as np
|
2024-09-26 11:01:15 +02:00
|
|
|
import whisper
|
2024-09-26 13:08:29 +02:00
|
|
|
from pydub import AudioSegment
|
2024-09-24 09:55:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
class VoiceRecognition:
    """Speech-to-text helper that transcribes OGG audio with Whisper."""

    # Whisper interprets a raw NumPy waveform as 16 kHz mono float32 audio,
    # so decoded audio must be resampled to this rate before transcription.
    _WHISPER_SAMPLE_RATE = 16000
    _MODEL_NAME = "base"

    # Lazily loaded Whisper model, shared by every call so the weights are
    # read from disk only once per process instead of once per request.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared Whisper model, loading it on first use."""
        if cls._model is None:
            cls._model = whisper.load_model(cls._MODEL_NAME)
        return cls._model

    @staticmethod
    def recognition(audio):
        """Transcribe an OGG audio stream and return the recognized text.

        Args:
            audio: Binary file-like object whose ``read()`` returns the
                complete OGG-encoded audio.

        Returns:
            The ``"text"`` entry of the Whisper transcription result. The
            text is also printed to stdout.
        """
        encoded = audio.read()
        with io.BytesIO(encoded) as audio_buffer:
            audio_segment = AudioSegment.from_ogg(audio_buffer)

        # Resample up front: feeding e.g. 48 kHz samples to Whisper as if
        # they were 16 kHz makes the speech appear slowed down and ruins
        # the transcription.
        audio_segment = audio_segment.set_frame_rate(
            VoiceRecognition._WHISPER_SAMPLE_RATE
        )

        samples = np.array(audio_segment.get_array_of_samples())

        # Samples are interleaved per frame; keep only the first channel.
        if audio_segment.channels > 1:
            samples = samples.reshape((-1, audio_segment.channels))[:, 0]

        # Scale signed integer PCM by its full-scale magnitude so the
        # waveform lies in [-1.0, 1.0).
        full_scale = 2 ** (audio_segment.sample_width * 8 - 1)
        waveform = samples.astype(np.float32) / full_scale

        result = VoiceRecognition._get_model().transcribe(waveform)
        print(result["text"])
        return result["text"]
|
2024-09-26 16:01:40 +02:00
|
|
|
|
|
|
|
# npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt
|