diff --git a/transcribe.py b/transcribe.py
index f3ddd85..e872ef5 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -3,13 +3,17 @@ import numpy as np
 import sounddevice as sd
 import queue
 import sys
+import torch
+from silero_vad import load_silero_vad, get_speech_timestamps
 
 # Parameters
 MODEL_TYPE = "tiny.en"
 CHANNELS = 1
 SAMPLERATE = 16000
-BLOCK_SIZE = 8000  # 0.5 seconds of audio per block
-TRANSCRIBE_RATE = 2  # Process every 2 seconds
+BLOCK_SIZE = 512  # Silero VAD prefers specific block sizes (512, 1024, 1536)
+VAD_THRESHOLD = 0.5  # Confidence threshold for speech
+BUFFER_LIMIT = SAMPLERATE * 30  # Max 30 seconds of audio buffer
+MIN_SILENCE_DURATION_MS = 500  # Silence duration to trigger transcription
 
 audio_queue = queue.Queue()
 
@@ -20,42 +24,79 @@ def callback(indata, frames, time, status):
 
 def main():
     print(f"Loading Whisper model '{MODEL_TYPE}'...")
-    model = whisper.load_model(MODEL_TYPE)
-    print("Model loaded.")
+    whisper_model = whisper.load_model(MODEL_TYPE)
+
+    print("Loading Silero VAD model...")
+    vad_model = load_silero_vad()
+
+    print("Models loaded.")
 
     print("\nAvailable Audio Devices:")
     devices = sd.query_devices()
     print(devices)
 
-    # Try to find a sensible default if the system one is tricky
     default_device = sd.default.device[0]
     print(f"\nUsing default input device index: {default_device}")
 
-    print("\nStarting live transcription... (Press Ctrl+C to stop)")
-    print("Note: On macOS, you may need to grant Microphone permissions to your terminal.\n")
+    print("\nStarting live transcription with VAD... (Press Ctrl+C to stop)")
 
-    audio_buffer = np.array([], dtype=np.float32)
+    audio_buffer = []
 
     try:
         with sd.InputStream(samplerate=SAMPLERATE, channels=CHANNELS, callback=callback, blocksize=BLOCK_SIZE):
             while True:
-                # Pull all available data from the queue
+                # Block (with a timeout so Ctrl+C stays responsive) until new audio
+                # arrives; polling an empty queue would re-run VAD on unchanged data
+                try:
+                    audio_buffer.append(audio_queue.get(timeout=0.1).flatten())
+                except queue.Empty:
+                    continue
+
+                # Drain anything else that queued up during the previous pass
                 while not audio_queue.empty():
                     data = audio_queue.get()
-                    audio_buffer = np.append(audio_buffer, data.flatten())
+                    audio_buffer.append(data.flatten())
 
-                # If we have enough audio, transcribe it
-                if len(audio_buffer) >= SAMPLERATE * TRANSCRIBE_RATE:
-                    # Transcribe the current buffer
-                    # fp16=False is used for CPU execution
-                    result = model.transcribe(audio_buffer, fp16=False, language="en")
-                    text = result['text'].strip()
-
-                    if text:
-                        print(f"Transcription: {text}")
-
-                    # Clear buffer for next chunk
-                    audio_buffer = np.array([], dtype=np.float32)
+                current_audio = np.concatenate(audio_buffer)
+                # Keep one merged array so the next pass does not re-join every block
+                audio_buffer = [current_audio]
+                buffer_len_samples = len(current_audio)
+
+                # Silero expects a torch tensor
+                speech_timestamps = get_speech_timestamps(
+                    torch.from_numpy(current_audio),
+                    vad_model,
+                    sampling_rate=SAMPLERATE,
+                    threshold=VAD_THRESHOLD,
+                    min_silence_duration_ms=MIN_SILENCE_DURATION_MS
+                )
+
+                if not speech_timestamps:
+                    # Drop pure silence after 2 seconds. Not gated on an earlier
+                    # detection: a segment seen in one pass is not guaranteed to be
+                    # reported in the next, and a latched flag would then let the
+                    # buffer grow without bound
+                    if buffer_len_samples > SAMPLERATE * 2:
+                        audio_buffer = []
+                    continue
+
+                first_start = speech_timestamps[0]['start']
+                last_end = speech_timestamps[-1]['end']
+                silence_samples = buffer_len_samples - last_end
+
+                # Flush once the speaker has paused for MIN_SILENCE_DURATION_MS,
+                # or when the buffer reaches BUFFER_LIMIT mid-utterance
+                if silence_samples > SAMPLERATE * MIN_SILENCE_DURATION_MS / 1000 or buffer_len_samples > BUFFER_LIMIT:
+                    # Transcribe only the detected speech span, not the silence around it
+                    # fp16=False is used for CPU execution
+                    result = whisper_model.transcribe(current_audio[first_start:last_end], fp16=False, language="en")
+                    text = result['text'].strip()
+
+                    if text:
+                        print(f"Transcription: {text}")
+
+                    # Reset buffer for the next utterance
+                    audio_buffer = []
 
     except KeyboardInterrupt:
         print("\nStopped by user.")