"""Live microphone transcription using OpenAI Whisper and sounddevice.

Audio is captured on the PortAudio callback thread, handed to the main
thread through a queue, accumulated until TRANSCRIBE_RATE seconds are
available, and then transcribed in one call.
"""

import queue
import sys

import numpy as np
import sounddevice as sd
import whisper

# Parameters
MODEL_TYPE = "tiny.en"
CHANNELS = 1
SAMPLERATE = 16000
BLOCK_SIZE = 8000  # 0.5 seconds of audio per block at 16 kHz
TRANSCRIBE_RATE = 2  # Seconds of audio to accumulate before transcribing

# How long the main loop blocks waiting for audio before re-checking.
# A finite timeout keeps Ctrl+C responsive on every platform.
_QUEUE_POLL_SECONDS = 0.1

# Hand-off between the audio callback thread (producer) and main() (consumer).
audio_queue = queue.Queue()


def callback(indata, frames, time, status):
    """Receive one block of captured audio from the input stream.

    Args:
        indata: Array of captured samples for this block.
        frames: Number of frames in ``indata`` (unused).
        time: Timing information supplied by the stream (unused).
        status: Stream status flags; reported on stderr when set.
    """
    if status:
        print(status, file=sys.stderr)
    # Copy before queueing: the buffer behind ``indata`` is only valid for
    # the duration of this callback.
    audio_queue.put(indata.copy())


def _to_mono(block):
    """Return ``block`` as a 1-D array, averaging channels if there are several."""
    if block.ndim == 1 or block.shape[1] == 1:
        return block.reshape(-1)
    return block.mean(axis=1, dtype=np.float32)


def main():
    """Load the model, open the default input stream, and print transcriptions.

    Runs until interrupted with Ctrl+C. Any other exception raised while
    streaming or transcribing is reported and ends the loop.
    """
    print(f"Loading Whisper model '{MODEL_TYPE}'...")
    model = whisper.load_model(MODEL_TYPE)
    print("Model loaded.")

    print("\nAvailable Audio Devices:")
    devices = sd.query_devices()
    print(devices)

    # Report the configured default input device; the stream below is opened
    # without an explicit device, so this is the one that will be used.
    default_device = sd.default.device[0]
    print(f"\nUsing default input device index: {default_device}")

    print("\nStarting live transcription... (Press Ctrl+C to stop)")
    print("Note: On macOS, you may need to grant Microphone permissions to your terminal.\n")

    samples_needed = SAMPLERATE * TRANSCRIBE_RATE
    # Chunks are collected in a list and joined once per transcription,
    # rather than re-copying a growing array for every incoming block.
    pending = []
    pending_samples = 0

    try:
        with sd.InputStream(
            samplerate=SAMPLERATE,
            channels=CHANNELS,
            dtype="float32",
            callback=callback,
            blocksize=BLOCK_SIZE,
        ):
            while True:
                # Block until audio arrives instead of spinning on the queue.
                try:
                    block = audio_queue.get(timeout=_QUEUE_POLL_SECONDS)
                except queue.Empty:
                    continue

                # Take that block plus any backlog that built up meanwhile
                # (for example while the previous transcription was running).
                while True:
                    mono = _to_mono(block)
                    pending.append(mono)
                    pending_samples += mono.size
                    try:
                        block = audio_queue.get_nowait()
                    except queue.Empty:
                        break

                if pending_samples < samples_needed:
                    continue

                audio_buffer = np.concatenate(pending).astype(np.float32, copy=False)
                # Clear buffer for next chunk
                pending.clear()
                pending_samples = 0

                # fp16=False is used for CPU execution
                result = model.transcribe(audio_buffer, fp16=False, language="en")
                text = result['text'].strip()
                if text:
                    print(f"Transcription: {text}")

    except KeyboardInterrupt:
        print("\nStopped by user.")
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit.
        print(f"\nError: {e}")


if __name__ == "__main__":
    main()