import io

import numpy as np
import whisper
from pydub import AudioSegment


class VoiceRecognition:
    @staticmethod
    def recognition(audio):
        # Read the uploaded file-like object into memory and decode the OGG container.
        audio_data = audio.read()
        with io.BytesIO(audio_data) as audio_buffer:
            audio_segment = AudioSegment.from_ogg(audio_buffer)

        # Whisper expects 16 kHz audio when given a raw NumPy array, so resample first.
        audio_segment = audio_segment.set_frame_rate(16000)
        raw_data = np.array(audio_segment.get_array_of_samples())
        if audio_segment.channels > 1:
            # Keep only the first channel of multi-channel audio.
            raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]

        # Normalize integer samples to float32 in the [-1.0, 1.0] range expected by Whisper.
        audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))

        model = whisper.load_model("base")
        result = model.transcribe(audio_data)
        print(result["text"])
        return result["text"]


# npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt
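
# A minimal usage sketch, not part of the original class. Assumptions: a local OGG file
# named "voice.ogg" (placeholder path), the openai-whisper, pydub, and numpy packages
# installed, and ffmpeg available on PATH for pydub decoding.
if __name__ == "__main__":
    with open("voice.ogg", "rb") as voice_file:
        transcript = VoiceRecognition.recognition(voice_file)
        print(transcript)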