forked from React-Group/interstellar_ai
		
	
		
			
				
	
	
		
			24 lines
		
	
	
	
		
			787 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			24 lines
		
	
	
	
		
			787 B
		
	
	
	
		
			Python
		
	
	
	
	
	
import io
 | 
						|
import numpy as np
 | 
						|
import whisper
 | 
						|
from pydub import AudioSegment
 | 
						|
 | 
						|
 | 
						|
class VoiceRecognition:
    """Speech-to-text for OGG audio uploads using OpenAI Whisper."""

    # Cached Whisper model. load_model("base") reads model weights from
    # disk and is expensive, so it must happen once per process rather
    # than once per transcription request.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the shared Whisper "base" model, loading it on first use."""
        if cls._model is None:
            cls._model = whisper.load_model("base")
        return cls._model

    @staticmethod
    def recognition(audio):
        """Transcribe OGG-encoded audio and return the recognized text.

        Args:
            audio: A file-like object (e.g. an uploaded file) whose
                ``read()`` yields the complete OGG-encoded byte stream.

        Returns:
            The transcribed text as a string. The text is also printed
            to stdout as a side effect.
        """
        audio_data = audio.read()
        with io.BytesIO(audio_data) as audio_buffer:
            audio_segment = AudioSegment.from_ogg(audio_buffer)

            # Whisper expects 16 kHz samples when given a raw ndarray;
            # resample so inputs at any original rate transcribe correctly.
            audio_segment = audio_segment.set_frame_rate(16000)

            raw_data = np.array(audio_segment.get_array_of_samples())

            # Down-mix multi-channel audio by keeping the first channel.
            if audio_segment.channels > 1:
                raw_data = raw_data.reshape((-1, audio_segment.channels))[:, 0]

            # Normalize integer PCM to float32 in [-1.0, 1.0), the format
            # whisper's transcribe() expects for ndarray input.
            audio_data = raw_data.astype(np.float32) / (2 ** (audio_segment.sample_width * 8 - 1))

        model = VoiceRecognition._get_model()
        result = model.transcribe(audio_data)
        print(result["text"])
        return result["text"]
 |