Initial commit: basic Whisper live transcription script

2026-02-26 20:53:39 -05:00
commit 0180b1f29e
3 changed files with 67 additions and 0 deletions
@@ -0,0 +1 @@
+__pycache__/\n*.pyc\n.DS_Store
@@ -0,0 +1,66 @@
+import whisper
+import numpy as np
+import sounddevice as sd
+import queue
+import sys
+
+# Parameters
+MODEL_TYPE = "tiny.en"  # model name handed to whisper.load_model()
+CHANNELS = 1  # input channel count requested from the audio stream
+SAMPLERATE = 16000  # capture rate requested from the stream, in samples per second
+BLOCK_SIZE = 8000  # samples per callback block: 0.5 seconds of audio at SAMPLERATE
+# Seconds of audio to accumulate before each transcription pass. Despite the
+# name this is a window length, not a frequency: the main loop transcribes once
+# SAMPLERATE * TRANSCRIBE_RATE samples are buffered.
+TRANSCRIBE_RATE = 2
+
+# Hand-off between the audio callback (producer) and the loop in main() (consumer).
+audio_queue = queue.Queue()
+
+def callback(indata, frames, time, status):
+    if status:
+        print(status, file=sys.stderr)
+    audio_queue.put(indata.copy())
+
+def main():
+    print(f"Loading Whisper model '{MODEL_TYPE}'...")
+    model = whisper.load_model(MODEL_TYPE)
+    print("Model loaded.")
+
+    print("\nAvailable Audio Devices:")
+    devices = sd.query_devices()
+    print(devices)
+    
+    # Try to find a sensible default if the system one is tricky
+    default_device = sd.default.device[0]
+    print(f"\nUsing default input device index: {default_device}")
+
+    print("\nStarting live transcription... (Press Ctrl+C to stop)")
+    print("Note: On macOS, you may need to grant Microphone permissions to your terminal.\n")
+    
+    audio_buffer = np.array([], dtype=np.float32)
+
+    try:
+        with sd.InputStream(samplerate=SAMPLERATE, channels=CHANNELS, callback=callback, blocksize=BLOCK_SIZE):
+            while True:
+                # Pull all available data from the queue
+                while not audio_queue.empty():
+                    data = audio_queue.get()
+                    audio_buffer = np.append(audio_buffer, data.flatten())
+
+                # If we have enough audio, transcribe it
+                if len(audio_buffer) >= SAMPLERATE * TRANSCRIBE_RATE:
+                    # Transcribe the current buffer
+                    # fp16=False is used for CPU execution
+                    result = model.transcribe(audio_buffer, fp16=False, language="en")
+                    text = result['text'].strip()
+                    
+                    if text:
+                        print(f"Transcription: {text}")
+                    
+                    # Clear buffer for next chunk
+                    audio_buffer = np.array([], dtype=np.float32)
+
+    except KeyboardInterrupt:
+        print("\nStopped by user.")
+    except Exception as e:
+        print(f"\nError: {e}")
+
+# Start transcription only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    main()