feat: pivot to always-active Whisper captioning and trigger word detection

This commit is contained in:
Adolfo Reyna
2026-03-02 11:50:07 -05:00
parent de06244862
commit e7ec4d7ab2
2 changed files with 158 additions and 119 deletions

264
jarvis.py
View File

@@ -1,24 +1,51 @@
import sys
import os import os
import subprocess import subprocess
import openwakeword
from openwakeword.model import Model
import pyaudio
import numpy as np
import speech_recognition as sr
import time import time
import re import re
import queue
import threading
from unittest.mock import MagicMock
# Comprehensive workaround for missing _lzma in some Python builds
try:
import lzma
except ImportError:
mock_lzma = MagicMock()
mock_lzma.FORMAT_XZ = 1
mock_lzma.FORMAT_ALONE = 2
mock_lzma.FORMAT_RAW = 3
mock_lzma.CHECK_NONE = 0
mock_lzma.CHECK_CRC32 = 1
mock_lzma.CHECK_CRC64 = 4
mock_lzma.CHECK_SHA256 = 10
sys.modules["_lzma"] = MagicMock()
sys.modules["lzma"] = mock_lzma
import numpy as np
import sounddevice as sd
import torch
import mlx_whisper
from silero_vad import load_silero_vad, get_speech_timestamps
from gtts import gTTS from gtts import gTTS
import pygame import pygame
import io import io
import sys
# --- Configuration ---
TRIGGER_WORD = "Jarvis"
WHISPER_MODEL = "mlx-community/whisper-small-mlx"
SAMPLERATE = 16000
BLOCK_SIZE = 512
VAD_THRESHOLD = 0.5
SILENCE_DURATION_MS = 1000
MAX_BUFFER_SECONDS = 20
CONTEXT_CHARS = 500 # How much previous text to keep for context
# Configuration
WAKE_WORD = "hey_jarvis"
SENSITIVITY = 0.5
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
USE_GTTS = False
WORKSPACE_DIR = "workspace" WORKSPACE_DIR = "workspace"
SOUL_PATH = "soul.md" SOUL_PATH = "soul.md"
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
# Ensure workspace exists # Ensure workspace exists
if not os.path.exists(WORKSPACE_DIR): if not os.path.exists(WORKSPACE_DIR):
@@ -27,89 +54,66 @@ if not os.path.exists(WORKSPACE_DIR):
# Initialize pygame mixer for audio playback # Initialize pygame mixer for audio playback
pygame.mixer.init() pygame.mixer.init()
# Global state
audio_queue = queue.Queue()
rolling_context = ""
current_session_id = None
def play_sound(sound_path=SYSTEM_SOUND):
    """Play a sound file asynchronously via the ``afplay`` command.

    The player is started with ``subprocess.Popen`` and not waited on, so
    this returns immediately while the sound plays.

    Args:
        sound_path: Path of the sound file handed to ``afplay``.
    """
    try:
        subprocess.Popen(["afplay", sound_path])
    except OSError as e:
        # The chime is purely cosmetic: a missing/unrunnable player must not
        # propagate and take down the caller's listening loop.
        print(f"[Jarvis] Could not play sound {sound_path}: {e}")
# Global session tracker
current_session_id = None
def get_latest_session_id():
    """Retrieve the UUID of the most recent Gemini session.

    Runs ``gemini --list-sessions`` inside ``WORKSPACE_DIR`` and scans its
    stdout for bracketed ids (``[...]`` made of hex digits and dashes).

    Returns:
        The last id found in the listing, or ``None`` when the command
        could not be run or printed no id.
        NOTE(review): treating the last entry as the most recent assumes
        the CLI lists sessions oldest-first -- confirm.
    """
    try:
        result = subprocess.run(
            ["gemini", "--list-sessions"],
            capture_output=True,
            text=True,
            cwd=WORKSPACE_DIR,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Only launching the CLI / decoding its output can fail here; keep
        # the try body to that call so unrelated bugs are not swallowed.
        print(f"Error fetching session ID: {e}")
        return None

    matches = re.findall(r"\[([a-f0-9\-]+)\]", result.stdout)
    return matches[-1] if matches else None
def speak_text(text):
    """Speak text using the 'say' command.

    Markdown markers (``*``, ``#`` and backticks) are removed first so they
    are not read aloud. Nothing is spoken when ``text`` is ``None``, empty,
    whitespace-only, or consists solely of those markers.

    Args:
        text: The text to speak; may be ``None`` or empty.
    """
    if not text or not text.strip():
        return

    # One pass over the string instead of three chained replace() calls.
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    if not clean_text.strip():
        # e.g. "***": passes the check above but leaves nothing to say.
        return

    try:
        subprocess.run(["say", clean_text])
    except OSError as e:
        # Speech output is best-effort; a missing 'say' binary must not
        # crash the caller.
        print(f"[Jarvis] Could not run 'say': {e}")
def run_gemini(command, is_init=False): def run_gemini(command, context="", is_init=False):
"""Call the gemini CLI, capture output, and speak it.""" """Call the gemini CLI, capture output, and speak it."""
global current_session_id global current_session_id
args = ["gemini", "--prompt", command, "--yolo"] # Combine context and command if provided
full_prompt = command
if context:
full_prompt = f"Recent Context: {context}\n\nUser Command: {command}"
args = ["gemini", "--prompt", full_prompt, "--yolo"]
if current_session_id: if current_session_id:
args.extend(["--resume", current_session_id]) args.extend(["--resume", current_session_id])
# Set up environment for Gemini CLI
env = os.environ.copy() env = os.environ.copy()
if os.path.exists(SOUL_PATH): if os.path.exists(SOUL_PATH):
with open(SOUL_PATH, 'r') as f: with open(SOUL_PATH, 'r') as f:
soul_content = f.read() soul_content = f.read()
# Inject date/time context only on initialization
if is_init: if is_init:
current_time = time.strftime("%A, %B %d, %Y, %I:%M %p") current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
soul_content = f"Temporal Context: The current date and time is {current_time}.\n\n" + soul_content soul_content = f"Temporal Context: The current date and time is {current_time}.\n\n" + soul_content
print(f"\n[Jarvis] Initializing system protocol with temporal context...") print(f"\n[Jarvis] Initializing system protocol...")
else:
print(f"\n[Jarvis] Communicating with Gemini...")
# Use a temporary file for the system instruction
system_md_path = os.path.abspath(os.path.join(WORKSPACE_DIR, ".system_prompt.md")) system_md_path = os.path.abspath(os.path.join(WORKSPACE_DIR, ".system_prompt.md"))
with open(system_md_path, 'w') as f: with open(system_md_path, 'w') as f:
f.write(soul_content) f.write(soul_content)
env["GEMINI_SYSTEM_MD"] = system_md_path env["GEMINI_SYSTEM_MD"] = system_md_path
else:
if is_init:
print(f"\n[Jarvis] Initializing system protocol...")
else:
print(f"\n[Jarvis] Communicating with Gemini...")
print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")
try: try:
# All Gemini commands run inside the workspace directory
process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env) process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env)
response = process.stdout.strip() response = process.stdout.strip()
@@ -126,76 +130,108 @@ def run_gemini(command, is_init=False):
except Exception as e: except Exception as e:
print(f"Error running gemini: {e}") print(f"Error running gemini: {e}")
def audio_callback(indata, frames, time, status):
    """Input-stream callback: queue each captured audio block.

    Passed as ``callback=`` to ``sd.InputStream``; the parameter list is
    the one that stream invokes, so it is kept as-is. Within this function
    the ``time`` parameter shadows the ``time`` module (it is not used).

    Args:
        indata: The captured audio block.
        frames: Unused here.
        time: Unused here.
        status: Truthy when the stream reports a condition; it is printed
            to stderr.
    """
    if status:
        print(status, file=sys.stderr)
    # Queue a copy rather than the object handed to the callback.
    # NOTE(review): presumably the stream reuses that buffer after the
    # callback returns -- confirm against the sounddevice docs.
    audio_queue.put(indata.copy())
def main():
    """Run the always-listening caption / trigger-word loop.

    Loads the Silero VAD model, sends the initialization prompt through
    ``run_gemini``, then opens a microphone input stream whose callback
    fills ``audio_queue``. Captured blocks are accumulated in a buffer;
    once the VAD reports speech followed by more than
    ``SILENCE_DURATION_MS`` of trailing audio, the buffer is transcribed
    with ``mlx_whisper``. Every non-empty transcript is printed as a
    caption and appended to ``rolling_context`` (trimmed to the last
    ``CONTEXT_CHARS`` characters). If the transcript contains
    ``TRIGGER_WORD``, the text after it is sent to ``run_gemini`` together
    with the context gathered before this utterance.

    The loop ends on ``KeyboardInterrupt`` or on any other exception,
    which is reported as a fatal error.
    """
    global rolling_context

    print("[Jarvis] Loading models...")
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"[Jarvis] Using device: {device}")
    vad_model = load_silero_vad()
    print("[Jarvis] Models loaded.")

    print("[Jarvis] Booting system protocols...")
    run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True)

    print(f"[Jarvis] Always-active mic enabled. Listening for '{TRIGGER_WORD}'...")

    # Compile once; escape the word so a trigger containing regex
    # metacharacters is still matched literally.
    trigger_pattern = re.compile(rf"\b{re.escape(TRIGGER_WORD)}\b", re.IGNORECASE)
    silence_samples = SAMPLERATE * SILENCE_DURATION_MS / 1000

    audio_buffer = []
    speech_started = False

    try:
        with sd.InputStream(samplerate=SAMPLERATE, channels=1,
                            callback=audio_callback, blocksize=BLOCK_SIZE):
            while True:
                # Block until new audio arrives instead of spinning on
                # audio_queue.empty(); with no new audio the buffer is
                # unchanged, so there is nothing to re-evaluate.
                try:
                    audio_buffer.append(audio_queue.get(timeout=0.1).flatten())
                except queue.Empty:
                    continue
                # Drain whatever else has piled up in the meantime.
                while True:
                    try:
                        audio_buffer.append(audio_queue.get_nowait().flatten())
                    except queue.Empty:
                        break

                current_audio = np.concatenate(audio_buffer)

                # Watchdog to prevent buffer bloat (checked before the VAD
                # pass so an oversized buffer is dropped without analysis).
                if len(current_audio) / SAMPLERATE > MAX_BUFFER_SECONDS:
                    print("[Jarvis] Buffer limit reached. Resetting...")
                    audio_buffer = []
                    speech_started = False
                    continue

                speech_timestamps = get_speech_timestamps(
                    torch.from_numpy(current_audio),
                    vad_model,
                    sampling_rate=SAMPLERATE,
                    threshold=VAD_THRESHOLD,
                    min_silence_duration_ms=SILENCE_DURATION_MS,
                )

                if not speech_timestamps:
                    # Clear buffer if no speech detected for 2 seconds
                    if not speech_started and len(current_audio) > SAMPLERATE * 2:
                        audio_buffer = []
                    continue

                speech_started = True
                last_end = speech_timestamps[-1]['end']

                # Speech counts as finished only once enough audio has
                # accumulated after the last detected speech segment.
                if (len(current_audio) - last_end) <= silence_samples:
                    continue

                print("[Jarvis] Transcribing...")
                result = mlx_whisper.transcribe(current_audio, path_or_hf_repo=WHISPER_MODEL)
                text = result['text'].strip()

                if text:
                    print(f"[Caption]: {text}")

                    trigger_match = trigger_pattern.search(text)
                    if trigger_match:
                        print(f"[Jarvis] Trigger word detected!")
                        play_sound(SYSTEM_SOUND)

                        # Command = text after the trigger word. Leading
                        # punctuation is dropped so "Jarvis, do X" yields
                        # "do X" and a bare "Jarvis." counts as no command.
                        command = text[trigger_match.end():].lstrip(" \t\r\n,.!?:;").strip()
                        if not command:
                            print("[Jarvis] No command following trigger word. Using full text.")
                            command = text

                        # The context passed here excludes this utterance.
                        run_gemini(command, context=rolling_context)

                    # Every transcript, triggered or not, joins the context.
                    rolling_context = (rolling_context + " " + text)[-CONTEXT_CHARS:].strip()

                # Reset buffer after processing
                audio_buffer = []
                speech_started = False

    except KeyboardInterrupt:
        print("\n[Jarvis] Shutting down...")
    except Exception as e:
        print(f"\n[Jarvis] Fatal Error: {e}")
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,9 @@
openwakeword mlx-whisper
pyaudio sounddevice
requests torch
SpeechRecognition silero-vad
transformers
numpy numpy
onnxruntime requests
pygame
gTTS