main #54
					 4 changed files with 18 additions and 7 deletions
				
			
		|  | @ -151,7 +151,7 @@ const InputOutputBackend: React.FC = () => { | ||||||
|     } |     } | ||||||
|          |          | ||||||
|     mediaRecorder.onstop = () => { |     mediaRecorder.onstop = () => { | ||||||
|       const audioBlob = new Blob(audioChunks.current, { type: "audio/wav" }) |       const audioBlob = new Blob(audioChunks.current, { type: "audio/ogg" }) | ||||||
|       const url = URL.createObjectURL(audioBlob) |       const url = URL.createObjectURL(audioBlob) | ||||||
|       console.log(url); |       console.log(url); | ||||||
|       setAudioURL(url) |       setAudioURL(url) | ||||||
|  |  | ||||||
|  | @ -4,13 +4,9 @@ import axios from "axios"; | ||||||
| class VoiceSend { | class VoiceSend { | ||||||
|     sendToVoiceRecognition(audio_data: Blob) { |     sendToVoiceRecognition(audio_data: Blob) { | ||||||
|         console.log("sending recording..."); |         console.log("sending recording..."); | ||||||
|         console.log(typeof (audio_data)); |  | ||||||
|         console.log(audio_data instanceof Blob); |  | ||||||
| 
 | 
 | ||||||
|         const formdata = new FormData() |         const formdata = new FormData() | ||||||
|         formdata.append("audio", audio_data) |         formdata.append("audio", audio_data) | ||||||
|         formdata.append("option", "offline") |  | ||||||
|         formdata.append("type", "basic") |  | ||||||
| 
 | 
 | ||||||
|         const dataSend = { option:"offline", type:"basic",audio:audio_data } |         const dataSend = { option:"offline", type:"basic",audio:audio_data } | ||||||
|         axios.post("http://localhost:5000/interstellar_ai/api/voice_recognition", formdata) |         axios.post("http://localhost:5000/interstellar_ai/api/voice_recognition", formdata) | ||||||
|  |  | ||||||
|  | @ -14,4 +14,5 @@ pycouchdb | ||||||
| pyttsx3 | pyttsx3 | ||||||
| pip-licenses | pip-licenses | ||||||
| openai-whisper | openai-whisper | ||||||
| pydub | pydub | ||||||
|  | ffmpeg | ||||||
							
								
								
									
										16
									
								
								py/voice.py
									
										
									
									
									
								
							
							
						
						
									
										16
									
								
								py/voice.py
									
										
									
									
									
								
							|  | @ -1,10 +1,24 @@ | ||||||
|  | import io | ||||||
|  | import numpy as np | ||||||
| import whisper | import whisper | ||||||
|  | from pydub import AudioSegment | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class VoiceRecognition: | class VoiceRecognition: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def recognition(audio): |     def recognition(audio): | ||||||
|  |         audio_data = audio.read() | ||||||
|  |         with io.BytesIO(audio_data) as audio_buffer: | ||||||
|  |             audio_segment = AudioSegment.from_ogg(audio_buffer) | ||||||
|  |              | ||||||
|  |             raw_data = np.array(audio_segment.get_array_of_samples()) | ||||||
|  |              | ||||||
|  |             if audio_segment.channels > 1: | ||||||
|  |                 raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0] | ||||||
|  |              | ||||||
|  |             audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1)) | ||||||
|  |          | ||||||
|         model = whisper.load_model("base") |         model = whisper.load_model("base") | ||||||
|         result = model.transcribe(audio) |         result = model.transcribe(audio_data) | ||||||
|         print(result["text"]) |         print(result["text"]) | ||||||
|         return result["text"] |         return result["text"] | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue