"""Voice front end for the ``gemini`` CLI.

Flow: wait for the wake word (openWakeWord) -> record and transcribe a spoken
command (SpeechRecognition) -> run it through the ``gemini`` CLI -> read the
reply aloud (gTTS + pygame, falling back to the ``say`` command).
"""

import io
import os
import re
import subprocess
import time
from typing import Optional

import numpy as np
import openwakeword
import pyaudio
import pygame
import speech_recognition as sr
from gtts import gTTS
from openwakeword.model import Model

# Configuration
WAKE_WORD = "hey_jarvis"
SENSITIVITY = 0.5  # wake-word score must exceed this to trigger
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"

# Audio setup for openWakeWord: 1280 samples at 16 kHz = 80 ms per frame.
CHUNK = 1280
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

# Captures the bracketed ID on the session line numbered "1."
# (e.g. "1. ... [c16895c1-...]").
_SESSION_ID_RE = re.compile(r"1\..*?\[(.*?)\]")

# Markdown punctuation removed before speaking so it is not read aloud.
_MARKDOWN_CHARS = str.maketrans("", "", "*#`")

# Initialize pygame mixer for audio playback
pygame.mixer.init()

# Global session tracker: ID passed to `gemini --resume` once known.
current_session_id = None

# Load the openWakeWord model using ONNX
model = Model(
    wakeword_models=[WAKE_WORD],
    inference_framework="onnx",
)

audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

# Speech recognition setup
recognizer = sr.Recognizer()


def play_sound():
    """Play the system sound that signals Jarvis is listening.

    Blocks until ``afplay`` has finished playing ``SYSTEM_SOUND``.
    """
    subprocess.run(["afplay", SYSTEM_SOUND])


def get_latest_session_id():
    """Return the session ID listed first by ``gemini --list-sessions``.

    Returns:
        The text inside the brackets of the entry numbered ``1.``, or
        ``None`` if the CLI could not be run or no such entry was printed.
    """
    # NOTE(review): this assumes entry 1 is the most recent session -
    # confirm against the CLI's actual ordering.
    try:
        result = subprocess.run(
            ["gemini", "--list-sessions"],
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError) as e:
        # OSError: CLI missing / not executable; ValueError: undecodable output.
        print(f"Error fetching session ID: {e}")
        return None

    match = _SESSION_ID_RE.search(result.stdout)
    return match.group(1) if match else None


def speak_text(text):
    """Speak *text* aloud using gTTS, falling back to the ``say`` command.

    Markdown markers (``*``, ``#`` and backticks) are stripped first. Nothing
    is spoken if the text is empty or contains only whitespace/markers.
    Blocks until playback has finished.
    """
    if not text or not text.strip():
        return

    # Remove markdown for cleaner speech
    clean_text = text.translate(_MARKDOWN_CHARS)
    if not clean_text.strip():
        # Only markup was present (e.g. "***"); there is nothing to say.
        return

    print("[Jarvis] Generating high-quality audio...")
    try:
        tts = gTTS(text=clean_text, lang="en")

        # Synthesize into memory instead of a temporary file.
        fp = io.BytesIO()
        tts.write_to_fp(fp)
        fp.seek(0)

        pygame.mixer.music.load(fp)
        pygame.mixer.music.play()

        # Poll about ten times a second until playback is finished.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    except Exception as e:
        # Deliberately broad: any synthesis or playback failure (e.g. being
        # offline) should degrade to the local `say` command.
        print(f"Error in TTS: {e}")
        subprocess.run(["say", clean_text])


def run_gemini(command):
    """Send *command* to the ``gemini`` CLI, print its reply and speak it.

    The first call starts a new conversation; once that call succeeds the
    session ID is looked up and stored in ``current_session_id`` so later
    calls pass ``--resume`` and continue the same conversation.

    Errors are reported on the console and never raised to the caller.
    """
    global current_session_id

    # NOTE(security): --yolo is passed for every command, and the command
    # text comes straight from speech recognition. Review before using this
    # anywhere a mis-heard command could be harmful.
    args = ["gemini", "--prompt", command, "--yolo"]

    if current_session_id:
        args.extend(["--resume", current_session_id])
        print(f"\n[Jarvis] Continuing session {current_session_id}...")
    else:
        print("\n[Jarvis] Starting new conversation session...")

    print(f"[Jarvis] Executing: {' '.join(args)}")

    try:
        # Output is captured (not streamed) so it can be spoken; it is
        # echoed to the console below once the CLI has finished.
        process = subprocess.run(args, capture_output=True, text=True)
        succeeded = process.returncode == 0
        if not succeeded:
            print(
                f"[Jarvis] gemini exited with status {process.returncode}: "
                f"{process.stderr.strip()}"
            )

        response = process.stdout.strip()
        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        # Only lock onto a session after a call that actually succeeded;
        # otherwise an unrelated, older session could be picked up.
        if succeeded and not current_session_id:
            time.sleep(1)  # pause before listing sessions
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session locked: {current_session_id}")
    except Exception as e:
        # Per-command boundary: a failed command must not stop the assistant.
        print(f"Error running gemini: {e}")


def listen_for_command() -> Optional[str]:
    """Record one spoken command and return its transcription.

    Returns:
        The recognized text, or ``None`` if nothing was said within the
        timeout, the audio could not be understood, or the speech service
        failed (each case is reported on the console).
    """
    print("[Jarvis] Listening for command...")
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source, duration=0.5)
        try:
            audio_cmd = recognizer.listen(
                source, timeout=5, phrase_time_limit=10
            )
        except sr.WaitTimeoutError:
            print("[Jarvis] No command detected.")
            return None

    # The microphone is released here, before the recognition request.
    print("[Jarvis] Transcribing...")
    try:
        command = recognizer.recognize_google(audio_cmd)
    except sr.UnknownValueError:
        print("[Jarvis] Could not understand audio.")
        return None
    except sr.RequestError as e:
        print(f"[Jarvis] Speech service error: {e}")
        return None

    print(f"[Jarvis] You said: {command}")
    return command


def handle_wake_word(score):
    """Run one full interaction after the wake word fired with *score*."""
    print(f"\n[Jarvis] Wake word detected! (Score: {score:.2f})")

    # Pause wake-word capture while the command is handled so that audio
    # recorded in the meantime is not fed to the detector afterwards.
    stream.stop_stream()
    try:
        play_sound()

        # 2. Capture Command
        command = listen_for_command()

        # 3. Execute
        if command is not None:
            run_gemini(command)
    finally:
        # Clear the detector's buffers so the utterance that just triggered
        # it cannot trigger it again, then resume capture.
        model.reset()
        stream.start_stream()

    print(f"\nListening for '{WAKE_WORD}'...")


print(f"Listening for '{WAKE_WORD}'...")

try:
    while True:
        # 1. Listen for Wake Word
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)

        if prediction[WAKE_WORD] > SENSITIVITY:
            handle_wake_word(prediction[WAKE_WORD])
except KeyboardInterrupt:
    print("\nStopping...")
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()
    pygame.mixer.quit()