ai-virtual-assistant/py/voice_recognition.py

#pip install faster-whisper
from api import API
import os
import wave
from faster_whisper import WhisperModel
import pyaudio

def transcribe_chunk(model, file_path):
    print("transcribing...")
    segments, _ = model.transcribe(file_path)
    transcription = ""
    for segment in segments:
        transcription += segment.text + " "
    return transcription.strip()

#Record voice
#chunk_length = duration to record in seconds
def record_chunk(p, stream, file_path, chunk_length=2):
    frames=[]
    for _ in range(0, int(16000/1024 * chunk_length)):
        data = stream.read(1024)
        frames.append(data)

    wf = wave.open(file_path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(16000)
    wf.writeframes(b''.join(frames))
    wf.close()
    
def listen():
    #model settings (tiny, base, small, medium, large)
    model_size = "medium"
    #what should it run on (cpu or cuda for gpu)
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate = 16000, input = True,  frames_per_buffer=1024)
    
    try:
        while True:
            print("Recording...")
            # CHANGE TEMP FILE PATH
            chunk_file="temp_chunk.wav"
            record_chunk(p, stream, chunk_file)
            transcription = transcribe_chunk(model, chunk_file)
            print(transcription)
            try:
                return transcription
            except Exception as e:
                return "ERROR"
            finally:
                if os.path.exists(chunk_file):
                    os.remove(chunk_file)
            break
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
voice recognition with whisper 2024-09-18 09:18:50 +02:00			`#pip install faster-whisper`
			`from api import API`
			`import os`
			`import wave`
			`from faster_whisper import WhisperModel`
			`import pyaudio`
voice_recognition 2024-09-16 14:38:45 +02:00
voice recognition with whisper 2024-09-18 09:18:50 +02:00			`def transcribe_chunk(model, file_path):`
			`print("transcribing...")`
			`segments, _ = model.transcribe(file_path)`
			`transcription = ""`
			`for segment in segments:`
			`transcription += segment.text + " "`
			`return transcription.strip()`
voice_recognition 2024-09-16 14:38:45 +02:00
voice recognition with whisper 2024-09-18 09:18:50 +02:00			`#Record voice`
			`#chunk_length = duration to record in seconds`
			`def record_chunk(p, stream, file_path, chunk_length=2):`
			`frames=[]`
			`for _ in range(0, int(16000/1024 * chunk_length)):`
			`data = stream.read(1024)`
			`frames.append(data)`
simple_chat, voice_recognition 2024-09-16 14:28:41 +02:00
voice recognition with whisper 2024-09-18 09:18:50 +02:00			`wf = wave.open(file_path, 'wb')`
			`wf.setnchannels(1)`
			`wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))`
			`wf.setframerate(16000)`
			`wf.writeframes(b''.join(frames))`
			`wf.close()`

			`def listen():`
			`#model settings (tiny, base, small, medium, large)`
			`model_size = "medium"`
			`#what should it run on (cpu or cuda for gpu)`
			`model = WhisperModel(model_size, device="cpu", compute_type="int8")`

			`p = pyaudio.PyAudio()`
			`stream = p.open(format=pyaudio.paInt16, channels=1, rate = 16000, input = True, frames_per_buffer=1024)`

			`try:`
			`while True:`
			`print("Recording...")`
			`# CHANGE TEMP FILE PATH`
			`chunk_file="temp_chunk.wav"`
			`record_chunk(p, stream, chunk_file)`
			`transcription = transcribe_chunk(model, chunk_file)`
			`print(transcription)`
			`try:`
			`return transcription`
			`except Exception as e:`
			`return "ERROR"`
			`finally:`
			`if os.path.exists(chunk_file):`
			`os.remove(chunk_file)`
			`break`
			`except KeyboardInterrupt:`
			`print("Stopping...")`
			`finally:`
			`stream.stop_stream()`
			`stream.close()`
			`p.terminate()`