From e7ec4d7ab2531e6d9f60459c67243a675791976d Mon Sep 17 00:00:00 2001 From: Adolfo Reyna Date: Mon, 2 Mar 2026 11:50:07 -0500 Subject: [PATCH] feat: pivot to always-active Whisper captioning and trigger word detection --- jarvis.py | 264 +++++++++++++++++++++++++++-------------------- requirements.txt | 13 ++- 2 files changed, 158 insertions(+), 119 deletions(-) diff --git a/jarvis.py b/jarvis.py index 400d773..7727bc2 100644 --- a/jarvis.py +++ b/jarvis.py @@ -1,24 +1,51 @@ +import sys import os import subprocess -import openwakeword -from openwakeword.model import Model -import pyaudio -import numpy as np -import speech_recognition as sr import time import re +import queue +import threading +from unittest.mock import MagicMock + +# Comprehensive workaround for missing _lzma in some Python builds +try: + import lzma +except ImportError: + mock_lzma = MagicMock() + mock_lzma.FORMAT_XZ = 1 + mock_lzma.FORMAT_ALONE = 2 + mock_lzma.FORMAT_RAW = 3 + mock_lzma.CHECK_NONE = 0 + mock_lzma.CHECK_CRC32 = 1 + mock_lzma.CHECK_CRC64 = 4 + mock_lzma.CHECK_SHA256 = 10 + sys.modules["_lzma"] = MagicMock() + sys.modules["lzma"] = mock_lzma + +import numpy as np +import sounddevice as sd +import torch +import mlx_whisper +from silero_vad import load_silero_vad, get_speech_timestamps from gtts import gTTS import pygame import io +import sys + +# --- Configuration --- +TRIGGER_WORD = "Jarvis" +WHISPER_MODEL = "mlx-community/whisper-small-mlx" +SAMPLERATE = 16000 +BLOCK_SIZE = 512 +VAD_THRESHOLD = 0.5 +SILENCE_DURATION_MS = 1000 +MAX_BUFFER_SECONDS = 20 +CONTEXT_CHARS = 500 # How much previous text to keep for context -# Configuration -WAKE_WORD = "hey_jarvis" -SENSITIVITY = 0.5 -SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff" -FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff" -USE_GTTS = False WORKSPACE_DIR = "workspace" SOUL_PATH = "soul.md" +SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff" +FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff" # Ensure workspace exists if not os.path.exists(WORKSPACE_DIR): @@ -27,89 +54,66 @@ if not os.path.exists(WORKSPACE_DIR): # Initialize pygame mixer for audio playback pygame.mixer.init() +# Global state +audio_queue = queue.Queue() +rolling_context = "" +current_session_id = None + def play_sound(sound_path=SYSTEM_SOUND): """Play a system sound asynchronously.""" subprocess.Popen(["afplay", sound_path]) -# Global session tracker -current_session_id = None - def get_latest_session_id(): """Retrieve the UUID of the most recent Gemini session.""" try: - # Check sessions from the workspace context result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True, cwd=WORKSPACE_DIR) - # Find all UUIDs inside brackets [UUID] matches = re.findall(r"\[([a-f0-9\-]+)\]", result.stdout) if matches: - # Return the last one in the list return matches[-1] except Exception as e: print(f"Error fetching session ID: {e}") return None def speak_text(text): - """Speak text using the 'say' command (default) or gTTS if configured.""" + """Speak text using the 'say' command.""" if not text or text.strip() == "": return clean_text = text.replace("*", "").replace("#", "").replace("`", "") - - if USE_GTTS: - try: - tts = gTTS(text=clean_text, lang='en') - fp = io.BytesIO() - tts.write_to_fp(fp) - fp.seek(0) - pygame.mixer.music.load(fp) - pygame.mixer.music.play() - while pygame.mixer.music.get_busy(): - pygame.time.Clock().tick(10) - return - except Exception as e: - print(f"Error in gTTS: {e}. Falling back to 'say'.") - + print(f"[Jarvis] Speaking: {clean_text}") subprocess.run(["say", clean_text]) -def run_gemini(command, is_init=False): +def run_gemini(command, context="", is_init=False): """Call the gemini CLI, capture output, and speak it.""" global current_session_id - args = ["gemini", "--prompt", command, "--yolo"] + # Combine context and command if provided + full_prompt = command + if context: + full_prompt = f"Recent Context: {context}\n\nUser Command: {command}" + + args = ["gemini", "--prompt", full_prompt, "--yolo"] if current_session_id: args.extend(["--resume", current_session_id]) - # Set up environment for Gemini CLI env = os.environ.copy() if os.path.exists(SOUL_PATH): with open(SOUL_PATH, 'r') as f: soul_content = f.read() - # Inject date/time context only on initialization if is_init: current_time = time.strftime("%A, %B %d, %Y, %I:%M %p") soul_content = f"Temporal Context: The current date and time is {current_time}.\n\n" + soul_content - print(f"\n[Jarvis] Initializing system protocol with temporal context...") - else: - print(f"\n[Jarvis] Communicating with Gemini...") + print(f"\n[Jarvis] Initializing system protocol...") - # Use a temporary file for the system instruction system_md_path = os.path.abspath(os.path.join(WORKSPACE_DIR, ".system_prompt.md")) with open(system_md_path, 'w') as f: f.write(soul_content) env["GEMINI_SYSTEM_MD"] = system_md_path - else: - if is_init: - print(f"\n[Jarvis] Initializing system protocol...") - else: - print(f"\n[Jarvis] Communicating with Gemini...") - print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}") - try: - # All Gemini commands run inside the workspace directory process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env) response = process.stdout.strip() @@ -126,76 +130,108 @@ def run_gemini(command, is_init=False): except Exception as e: print(f"Error running gemini: {e}") -# --- Startup Sequence --- +def audio_callback(indata, frames, time, status): + if status: + print(status, file=sys.stderr) + audio_queue.put(indata.copy()) -model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx") +def main(): + global rolling_context + + print("[Jarvis] Loading models...") + device = "mps" if torch.backends.mps.is_available() else "cpu" + print(f"[Jarvis] Using device: {device}") + + vad_model = load_silero_vad() + print("[Jarvis] Models loaded.") -CHUNK = 1280 -FORMAT = pyaudio.paInt16 -CHANNELS = 1 -RATE = 16000 + print("[Jarvis] Booting system protocols...") + run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True) -audio = pyaudio.PyAudio() -stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) + print(f"[Jarvis] Always-active mic enabled. Listening for '{TRIGGER_WORD}'...") + + audio_buffer = [] + speech_started = False + last_change_time = time.time() + + try: + with sd.InputStream(samplerate=SAMPLERATE, channels=1, callback=audio_callback, blocksize=BLOCK_SIZE): + while True: + while not audio_queue.empty(): + data = audio_queue.get() + audio_buffer.append(data.flatten()) -recognizer = sr.Recognizer() -recognizer.pause_threshold = 1.2 -recognizer.non_speaking_duration = 0.5 -mic = sr.Microphone() + if len(audio_buffer) > 0: + current_audio = np.concatenate(audio_buffer) + audio_tensor = torch.from_numpy(current_audio) + buffer_duration = len(current_audio) / SAMPLERATE + + speech_timestamps = get_speech_timestamps( + audio_tensor, + vad_model, + sampling_rate=SAMPLERATE, + threshold=VAD_THRESHOLD, + min_silence_duration_ms=SILENCE_DURATION_MS + ) -print("[Jarvis] Calibrating for ambient noise...") -with mic as source: - recognizer.adjust_for_ambient_noise(source, duration=1) + # Watchdog to prevent buffer bloat + if buffer_duration > MAX_BUFFER_SECONDS: + print("[Jarvis] Buffer limit reached. Resetting...") + audio_buffer = [] + speech_started = False + continue -print("[Jarvis] Booting system protocols...") -current_time = time.strftime("%A, %B %d, %Y, %I:%M %p") -run_gemini(f"System initialization complete. The current date and time is {current_time}. Awaiting orders, Sir.", is_init=True) - -print(f"Listening for '{WAKE_WORD}'...") - -try: - while True: - data = stream.read(CHUNK, exception_on_overflow=False) - audio_frame = np.frombuffer(data, dtype=np.int16) - prediction = model.predict(audio_frame) - - if prediction[WAKE_WORD] > SENSITIVITY: - print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})") - stream.stop_stream() - - in_conversation = True - first_listening = True - - while in_conversation: - play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND) - print("[Jarvis] Listening...") - - with mic as source: - try: - audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15) - print("[Jarvis] Transcribing...") - command = recognizer.recognize_google(audio_cmd) - print(f"[Jarvis] You said: {command}") + if len(speech_timestamps) > 0: + speech_started = True + last_end = speech_timestamps[-1]['end'] + buffer_len_samples = len(current_audio) - run_gemini(command) - first_listening = False - - except sr.WaitTimeoutError: - print("[Jarvis] Session timed out.") - in_conversation = False - except sr.UnknownValueError: - print("[Jarvis] No speech detected. Ending session.") - in_conversation = False - except sr.RequestError as e: - print(f"[Jarvis] Speech service error: {e}") - in_conversation = False - - stream.start_stream() - print(f"\nListening for '{WAKE_WORD}'...") + # Check if speech has ended (silence after last speech) + if (buffer_len_samples - last_end) > (SAMPLERATE * SILENCE_DURATION_MS / 1000): + + # Transcribe + print("[Jarvis] Transcribing...") + result = mlx_whisper.transcribe(current_audio, path_or_hf_repo=WHISPER_MODEL) + text = result['text'].strip() + + if text: + print(f"[Caption]: {text}") + + # Detect Trigger Word + trigger_match = re.search(rf"\b{TRIGGER_WORD}\b", text, re.IGNORECASE) + if trigger_match: + print(f"[Jarvis] Trigger word detected!") + play_sound(SYSTEM_SOUND) + + # Extract command (text after trigger word) + start_idx = trigger_match.end() + command = text[start_idx:].strip() + + if not command: + print("[Jarvis] No command following trigger word. Using full text.") + command = text + + # Call Gemini with context + run_gemini(command, context=rolling_context) + + # Update context with this exchange + rolling_context = (rolling_context + " " + text)[-CONTEXT_CHARS:].strip() + else: + # Update context with current transcription + rolling_context = (rolling_context + " " + text)[-CONTEXT_CHARS:].strip() -except KeyboardInterrupt: - print("\nStopping...") -finally: - stream.stop_stream() - stream.close() - audio.terminate() + # Reset buffer after processing + audio_buffer = [] + speech_started = False + + elif not speech_started and len(current_audio) > SAMPLERATE * 2: + # Clear buffer if no speech detected for 2 seconds + audio_buffer = [] + + except KeyboardInterrupt: + print("\n[Jarvis] Shutting down...") + except Exception as e: + print(f"\n[Jarvis] Fatal Error: {e}") + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 737cc00..fae0786 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ -openwakeword -pyaudio -requests -SpeechRecognition +mlx-whisper +sounddevice +torch +silero-vad +transformers numpy -onnxruntime +requests +pygame +gTTS