From 9663f49dee7b233e4327950145163f4785886b23 Mon Sep 17 00:00:00 2001 From: YasinOnm08 Date: Fri, 27 Sep 2024 08:03:12 +0200 Subject: [PATCH 1/4] copy feedback fix? --- app/components/ConversationFrontend.tsx | 2 +- app/components/InputFrontend.tsx | 8 -------- app/styles/output.css | 4 ++++ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/app/components/ConversationFrontend.tsx b/app/components/ConversationFrontend.tsx index e6bfe39..dc3114c 100644 --- a/app/components/ConversationFrontend.tsx +++ b/app/components/ConversationFrontend.tsx @@ -58,7 +58,7 @@ const ConversationFrontend = React.forwardRef -

Copied!

+

Copied!

diff --git a/app/components/InputFrontend.tsx b/app/components/InputFrontend.tsx index c84124b..74539e6 100644 --- a/app/components/InputFrontend.tsx +++ b/app/components/InputFrontend.tsx @@ -13,10 +13,6 @@ const InputFrontend = React.forwardRef( ({ message, onSendClick, onMicClick, inputDisabled, isRecording}, ref: ForwardedRef) => { const [inputValue, setInputValue] = useState(''); - useEffect(() => { - setInputValue(message); - }, [message]); - const handleInputChange = (e: React.ChangeEvent) => { setInputValue(e.target.value); }; @@ -31,10 +27,6 @@ const InputFrontend = React.forwardRef( } }; - const styles = { - - } - return (
Date: Fri, 27 Sep 2024 10:57:18 +0200 Subject: [PATCH 2/4] voice recognition progress 1 --- app/backend/InputOutputHandler.tsx | 56 +++++++++++++++++++++++++----- app/layout.tsx | 2 +- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/app/backend/InputOutputHandler.tsx b/app/backend/InputOutputHandler.tsx index 1316901..fa8dcf6 100644 --- a/app/backend/InputOutputHandler.tsx +++ b/app/backend/InputOutputHandler.tsx @@ -1,11 +1,13 @@ "use client" -import React, { useEffect, useRef, useState } from "react"; +import React, { use, useEffect, useRef, useState } from "react"; import ConversationFrontend from "../components/ConversationFrontend"; import InputFrontend from "../components/InputFrontend"; import VoiceSend from "./voice_backend" import { AudioRecorder } from "./AudioRecorder"; import axios from "axios"; import { resolve } from "path"; +import { FFmpeg } from "@ffmpeg/ffmpeg"; +import { fetchFile, toBlobURL } from "@ffmpeg/util" const InputOutputBackend: React.FC = () => { @@ -125,6 +127,17 @@ const InputOutputBackend: React.FC = () => { }); }; + /* Variables for System-prompt */ + const [preferredCurrency, setPreferredCurrency] = useState(localStorage.getItem("preferredCurrency") || "") + const [preferredLanguage, setPreferredLanguage] = useState(localStorage.getItem("preferredLanguage") || "") + const [timeFormat, setTimeFormat] = useState(localStorage.getItem("timeFormat") || "") + const [preferredMeasurement, setPreferredMeasurement] = useState(localStorage.getItem("preferredMeasurement") || "") + const [timeZone, setTimeZone] = useState(localStorage.getItem("timeZone") || "") + const [dateFormat, setDateFormat] = useState(localStorage.getItem("dateFormat") || "") + + useEffect(() => { + + },[preferredCurrency, preferredLanguage, timeFormat, preferredMeasurement, timeZone, dateFormat]) const addMessage = (role: string, content: string) => { setMessages(previous => [...previous, { role, content }]) @@ -152,25 +165,52 @@ const InputOutputBackend: React.FC = () => { audioChunks.current.push(event.data) } - mediaRecorder.onstop = () => { + mediaRecorder.onstop = async () => { const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" }) + console.log(audioBlob); const url = URL.createObjectURL(audioBlob) + const audio = new Audio(url); + audio.play().catch(error => console.error("Error playing audio:", error)); console.log(url); setAudioURL(url) audioChunks.current = [] + const wavBlob = await convertOggToWav(audioBlob) const remote = new VoiceSend() - remote.sendToVoiceRecognition(audioBlob,) + remote.sendToVoiceRecognition(wavBlob) } mediaRecorder.start() setIsRecording(true) - + } + + const ffmpegRef = useRef(null) + const audioRef = useRef("") + + const loadFFmpeg = async () => { + if (!ffmpegRef.current) { + ffmpegRef.current = new FFmpeg() + await ffmpegRef.current.load() } + } + + const convertOggToWav = async (oggFile: File | Blob) => { + await loadFFmpeg() + + const ffmpeg = ffmpegRef.current! + + await ffmpeg.writeFile("input.ogg", await fetchFile(oggFile)) + await ffmpeg.exec(["-i", "input.ogg", "output.wav"]) + const wavData = await ffmpeg.readFile("output.wav") + console.log(wavData); + const wavBlob = new Blob([wavData], { type: "audio/wav" }) + audioRef.current = URL.createObjectURL(wavBlob) + return wavBlob + } - const stopRecording = () => { - mediaRecorderRef.current?.stop() - setIsRecording(false) - } + const stopRecording = () => { + mediaRecorderRef.current?.stop() + setIsRecording(false) + } const handleMicClick = () => { diff --git a/app/layout.tsx b/app/layout.tsx index 8070a08..0896d49 100644 --- a/app/layout.tsx +++ b/app/layout.tsx @@ -12,7 +12,7 @@ export default function RootLayout({ children }: { children: ReactNode }) { {metadata.title} {/* Tried adding the favicon here */} - +
{children}
From 8090ce969e00d3769cadcc77d58f5fac0328d7b8 Mon Sep 17 00:00:00 2001 From: YasinOnm08 Date: Fri, 27 Sep 2024 13:59:27 +0200 Subject: [PATCH 3/4] voice recognition kinda works?? --- app/backend/InputOutputHandler.tsx | 23 +++++++------------ py/api.py | 2 +- py/voice.py | 36 +++++++++++++++++------------- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/app/backend/InputOutputHandler.tsx b/app/backend/InputOutputHandler.tsx index fa8dcf6..83c998a 100644 --- a/app/backend/InputOutputHandler.tsx +++ b/app/backend/InputOutputHandler.tsx @@ -167,16 +167,14 @@ const InputOutputBackend: React.FC = () => { mediaRecorder.onstop = async () => { const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" }) - console.log(audioBlob); - const url = URL.createObjectURL(audioBlob) - const audio = new Audio(url); - audio.play().catch(error => console.error("Error playing audio:", error)); - console.log(url); - setAudioURL(url) audioChunks.current = [] - const wavBlob = await convertOggToWav(audioBlob) + // console.log(audioBlob); + // const url = URL.createObjectURL(audioBlob) + // const audio = new Audio(url); + // audio.play().catch(error => console.error("Error playing audio:", error)); + const remote = new VoiceSend() - remote.sendToVoiceRecognition(wavBlob) + remote.sendToVoiceRecognition(audioBlob) } mediaRecorder.start() @@ -276,14 +274,9 @@ const InputOutputBackend: React.FC = () => { onMicClick={handleMicClick} inputDisabled={inputDisabled} isRecording={isRecording} - /> + />
) } -export default InputOutputBackend - - - - - +export default InputOutputBackend \ No newline at end of file diff --git a/py/api.py b/py/api.py index c43bf1f..f3e13ea 100644 --- a/py/api.py +++ b/py/api.py @@ -1,4 +1,4 @@ -from flask i mport Flask, request, jsonify +from flask import Flask, request, jsonify from flask_cors import CORS import secrets import threading diff --git a/py/voice.py b/py/voice.py index 461da21..dc0d28b 100644 --- a/py/voice.py +++ b/py/voice.py @@ -1,26 +1,32 @@ import io import numpy as np -import whisper +from faster_whisper import WhisperModel from pydub import AudioSegment class VoiceRecognition: @staticmethod def recognition(audio): - audio_data = audio.read() - with io.BytesIO(audio_data) as audio_buffer: - audio_segment = AudioSegment.from_ogg(audio_buffer) - - raw_data = np.array(audio_segment.get_array_of_samples()) - - if audio_segment.channels > 1: - raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0] - - audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1)) + audio_buffer = io.BytesIO(audio.read()) - model = whisper.load_model("base") - result = model.transcribe(audio_data) - print(result["text"]) - return result["text"] + try: + audio_segment = AudioSegment.from_file(audio_buffer, format="ogg") + + wav_io = io.BytesIO() + audio_segment.export(wav_io, format="wav") + wav_io.seek(0) + except: + print("audio to wav failed") + + model_size = "base" + model = WhisperModel(model_size, device="cpu", compute_type="int8") + + segments, _ = model.transcribe(wav_io) + transcription = "" + for segment in segments: + transcription += segment.text + " " + result = transcription.strip() + print(result) + return result # npm install @ffmpeg/ffmpeg @ffmpeg/util @ffmpeg/types @ffmpeg/core-mt \ No newline at end of file From 015e9a1064170661392a5b8875042085b0abeba9 Mon Sep 17 00:00:00 2001 From: YasinOnm08 Date: Fri, 27 Sep 2024 14:03:43 +0200 Subject: [PATCH 4/4] ... --- app/layout.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/layout.tsx b/app/layout.tsx index 0896d49..0425420 100644 --- a/app/layout.tsx +++ b/app/layout.tsx @@ -12,7 +12,7 @@ export default function RootLayout({ children }: { children: ReactNode }) { {metadata.title} {/* Tried adding the favicon here */} - +
{children}