From ab12b796ec9a2c1fe16eb84fb4bf3ea64bc04c52 Mon Sep 17 00:00:00 2001
From: Patrick_Pluto
Date: Thu, 26 Sep 2024 11:01:15 +0200
Subject: [PATCH] whisper is better

---
 py/api.py   | 11 ++---------
 py/voice.py | 48 +++++++-----------------------------------------
 2 files changed, 9 insertions(+), 50 deletions(-)

diff --git a/py/api.py b/py/api.py
index 0951717..f3e13ea 100644
--- a/py/api.py
+++ b/py/api.py
@@ -99,16 +99,9 @@ class API:
 
         @self.app.route('/interstellar_ai/api/voice_recognition', methods=['POST'])
         def voice_recognition():
-            print(request.args)
-            recog_type = request.form.get('type')
-            print(recog_type)
             audio = request.files.get('audio')
-            option = request.form.get('option')
-            if recog_type == "basic":
-                text = self.voice.basic_recognition(audio, option)
-                return jsonify({'status': 200, 'response': text})
-            else:
-                return jsonify({'status': 401, 'response': "Invalid type"})
+            text = self.voice.recognition(audio)
+            return jsonify({'status': 200, 'response': text})
 
         @self.app.route('/interstellar_ai/api/weather', methods=['POST'])
         def get_weather():
diff --git a/py/voice.py b/py/voice.py
index b9500da..b4516f0 100644
--- a/py/voice.py
+++ b/py/voice.py
@@ -1,44 +1,10 @@
-import speech_recognition as sr
-from pydub import AudioSegment
+import whisper
 
 
 class VoiceRecognition:
-    def check_audio_format(self, file_path):
-        try:
-            audio = AudioSegment.from_ogg(file_path)
-            print(f"Audio format: {audio.format}")
-            return True
-        except Exception as e:
-            print(f"Error reading audio file: {e}")
-            return False
-
-    def basic_recognition(self, audio, option):
-        print(type(audio))
-        print("preparing")
-        r = sr.Recognizer()
-
-        # Read the data from the FileStorage object
-        audio_data = audio.read()
-
-        # Write the audio data to a file
-        with open('output.wav', 'wb') as file:
-            file.write(audio_data)
-
-        self.check_audio_format(audio)
-        if option == "online":
-            with sr.AudioFile(audio) as source:
-                print(type(source))
-                print("online")
-                text = r.recognize_google_cloud(source)
-                print("recognized as: " + text)
-                return text
-        elif option == "offline":
-            with sr.AudioFile(audio) as source:
-                print(type(source))
-                print("offline")
-                text = r.recognize_sphinx(source)
-                print("recognized as: " + text)
-                return text
-
-        print("nothing")
-        return False
+    @staticmethod
+    def recognition(audio):
+        model = whisper.load_model("base")
+        result = model.transcribe(audio)
+        print(result["text"])
+        return result["text"]
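
Note (not part of the patch): whisper's transcribe() accepts a file path, a NumPy
array, or a torch tensor, while Flask's request.files.get('audio') yields a
Werkzeug FileStorage object. Below is a minimal sketch of how the new
recognition() call could bridge the two, assuming the upload is first persisted
to a temporary file; the helper name and temp-file handling are illustrative
assumptions, not taken from this patch:

    import os
    import tempfile

    import whisper

    def transcribe_upload(audio):
        # Assumption: 'audio' is a Werkzeug FileStorage from request.files.
        # Whisper loads audio via ffmpeg, so it needs a real file on disk.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio.save(tmp)  # FileStorage.save() accepts an open file object
            path = tmp.name
        try:
            model = whisper.load_model("base")  # cache in practice; loading is slow
            result = model.transcribe(path)
            return result["text"]
        finally:
            os.remove(path)  # clean up the temporary file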