Merge pull request 'voice recognition with whisper' (#40) from YasinOnm08/ai-virtual-assistant:main into main

Reviewed-on: https://interstellardevelopment.org/code/code/React-Group/ai-virtual-assistant/pulls/40
This commit is contained in:
Patrick 2024-09-18 09:19:51 +02:00
commit de12ebfa03

View file

@ -1,31 +1,61 @@
import speech_recognition as sr
#pip install faster-whisper
from api import API
import os
import wave
from faster_whisper import WhisperModel
import pyaudio
class Voice:
    """Offline speech-to-text helper built on the speech_recognition package."""

    @staticmethod
    def listen():
        """Listen on the default microphone and return the recognized text.

        Returns the Sphinx transcription of one utterance, or the string
        "ERROR" if the microphone, calibration, or recognition fails.
        """
        recognizer = sr.Recognizer()
        try:
            with sr.Microphone() as source:
                print("Adjusting for ambient noise...")
                # Sample 0.5 s of ambient sound to calibrate the noise threshold.
                recognizer.adjust_for_ambient_noise(source, duration=0.5)
                print("Listening...")
                # Blocks until the user stops speaking.
                audio_data = recognizer.listen(source)
                print("Audio captured")
            try:
                # Sphinx decoding works fully offline.
                text = recognizer.recognize_sphinx(audio_data)
            except (sr.UnknownValueError, sr.RequestError):
                # Unintelligible audio or missing/unreachable recognizer.
                text = "ERROR"
        except Exception:
            # Microphone setup / calibration failures also map to "ERROR"
            # so callers only ever see a string.
            text = "ERROR"
        return text
def transcribe_chunk(model, file_path):
    """Transcribe the audio file at `file_path` with `model`.

    model     -- a faster-whisper WhisperModel (anything with .transcribe)
    file_path -- path to the audio chunk to transcribe
    Returns the concatenated segment texts as one stripped string.
    """
    print("transcribing...")
    segments, _ = model.transcribe(file_path)
    # Join every segment's text with single spaces; strip the outer padding.
    return " ".join(segment.text for segment in segments).strip()
# Record one fixed-length chunk of microphone audio to a WAV file.
def record_chunk(p, stream, file_path, chunk_length=2):
    """Record `chunk_length` seconds of 16 kHz mono audio and save it as WAV.

    p            -- the pyaudio.PyAudio instance (used for sample-size lookup)
    stream       -- an open PyAudio input stream delivering 16-bit 16 kHz mono
    file_path    -- destination .wav path (overwritten if it exists)
    chunk_length -- recording duration in seconds (default 2)
    """
    frames = []
    # 16000 samples/s read in buffers of 1024 frames each.
    for _ in range(int(16000 / 1024 * chunk_length)):
        frames.append(stream.read(1024))
    # Context manager closes the file even if a write fails
    # (the original leaked the handle on any exception).
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(16000)
        wf.writeframes(b''.join(frames))
def listen():
    """Record one chunk from the microphone and return its Whisper transcription.

    Returns the transcription string, or None if interrupted with Ctrl-C.

    Bug fixed: the original placed a `break` inside a `finally` clause, which
    discards the in-flight `return transcription`, so the function always
    returned None. The pointless inner `try: return ... except:` is removed.
    """
    # model settings (tiny, base, small, medium, large)
    model_size = "medium"
    # device: "cpu", or "cuda" to run on the GPU
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    # TODO(review): use the tempfile module instead of a fixed relative path.
    chunk_file = "temp_chunk.wav"
    try:
        print("Recording...")
        record_chunk(p, stream, chunk_file)
        transcription = transcribe_chunk(model, chunk_file)
        print(transcription)
        return transcription
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        # Always remove the temp file and release the audio device.
        if os.path.exists(chunk_file):
            os.remove(chunk_file)
        stream.stop_stream()
        stream.close()
        p.terminate()