"""Wake-word driven voice front-end for the ``gemini`` command-line tool.

The script listens on the microphone for the wake word, records and
transcribes the following utterance with Google speech recognition, sends the
text to the ``gemini`` CLI inside ``WORKSPACE_DIR`` and speaks the reply.

It shells out to ``afplay`` and ``say`` and uses sound files under
``/System/Library/Sounds``.
"""

import io
import os
import re
import subprocess
import time

import numpy as np
import openwakeword
import pyaudio
import pygame
import speech_recognition as sr
from gtts import gTTS
from openwakeword.model import Model

# Configuration
WAKE_WORD = "hey_jarvis"
SENSITIVITY = 0.5  # wake-word score must exceed this to start a conversation
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
USE_GTTS = False  # True: speak through gTTS + pygame; False: use `say`
WORKSPACE_DIR = "workspace"
SOUL_PATH = "soul.md"

# Ensure workspace exists (exist_ok avoids the check-then-create race).
os.makedirs(WORKSPACE_DIR, exist_ok=True)

# Initialize pygame mixer for audio playback
pygame.mixer.init()


def play_sound(sound_path=SYSTEM_SOUND):
    """Play a sound file with ``afplay`` without waiting for it to finish.

    Args:
        sound_path: Path of the audio file handed to ``afplay``.
    """
    subprocess.Popen(["afplay", sound_path])


# Global session tracker: ID passed to `gemini --resume` once it is known.
current_session_id = None


def get_latest_session_id():
    """Return the last bracketed session ID printed by ``gemini --list-sessions``.

    The command runs with ``WORKSPACE_DIR`` as its working directory.

    Returns:
        The last ``[id]`` token found in the command's stdout, or ``None`` when
        nothing matches or the command could not be run.
    """
    try:
        result = subprocess.run(
            ["gemini", "--list-sessions"],
            capture_output=True,
            text=True,
            cwd=WORKSPACE_DIR,
        )
        # Session IDs are expected as hex/dash tokens inside square brackets.
        matches = re.findall(r"\[([a-f0-9\-]+)\]", result.stdout)
        if matches:
            # The last entry in the listing is taken as the most recent one.
            return matches[-1]
    except Exception as e:
        print(f"Error fetching session ID: {e}")
    return None


def speak_text(text):
    """Speak ``text`` aloud, via gTTS when ``USE_GTTS`` is set, else ``say``.

    Markdown markers (``*``, ``#`` and backticks) are removed first. Nothing
    is spoken when the text is empty or blank, before or after that cleanup.
    If the gTTS path raises, the error is printed and ``say`` is used instead.

    Args:
        text: The text to speak; may be ``None`` or empty.
    """
    if not text or not text.strip():
        return

    clean_text = text.replace("*", "").replace("#", "").replace("`", "")
    if not clean_text.strip():
        # The text consisted only of Markdown markers; nothing left to say.
        return

    if USE_GTTS:
        try:
            tts = gTTS(text=clean_text, lang='en')
            fp = io.BytesIO()
            tts.write_to_fp(fp)
            fp.seek(0)
            pygame.mixer.music.load(fp)
            pygame.mixer.music.play()
            # One clock for the whole wait loop instead of one per iteration.
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
            return
        except Exception as e:
            print(f"Error in gTTS: {e}. Falling back to 'say'.")

    # Feed the text on stdin rather than as an argument: text that starts with
    # "-" (e.g. a Markdown bullet) would otherwise be parsed as `say` options.
    subprocess.run(["say"], input=clean_text, encoding="utf-8")


def run_gemini(command, is_init=False):
    """Send ``command`` to the ``gemini`` CLI, print its reply and speak it.

    When ``SOUL_PATH`` exists, its content is written to
    ``WORKSPACE_DIR/.system_prompt.md`` and that path is exported to the child
    process as ``GEMINI_SYSTEM_MD``. On initialization the current date and
    time are prepended to that content. After an initialization call with no
    known session, the latest session ID is looked up and stored in the
    module-level ``current_session_id`` so later calls pass ``--resume``.

    Args:
        command: Prompt text passed to ``gemini --prompt``.
        is_init: True for the start-up call that establishes the session.
    """
    global current_session_id

    args = ["gemini", "--prompt", command, "--yolo"]
    if current_session_id:
        args.extend(["--resume", current_session_id])

    # Set up environment for Gemini CLI
    env = os.environ.copy()
    if os.path.exists(SOUL_PATH):
        with open(SOUL_PATH, 'r', encoding="utf-8") as f:
            soul_content = f.read()

        # Inject date/time context only on initialization
        if is_init:
            current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
            soul_content = (
                f"Temporal Context: The current date and time is {current_time}.\n\n"
                + soul_content
            )
            print("\n[Jarvis] Initializing system protocol with temporal context...")
        else:
            print("\n[Jarvis] Communicating with Gemini...")

        # Use a file inside the workspace for the system instruction
        system_md_path = os.path.abspath(
            os.path.join(WORKSPACE_DIR, ".system_prompt.md")
        )
        with open(system_md_path, 'w', encoding="utf-8") as f:
            f.write(soul_content)
        env["GEMINI_SYSTEM_MD"] = system_md_path
    else:
        if is_init:
            print("\n[Jarvis] Initializing system protocol...")
        else:
            print("\n[Jarvis] Communicating with Gemini...")

    print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")

    try:
        # All Gemini commands run inside the workspace directory
        process = subprocess.run(
            args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env
        )

        # Surface CLI failures instead of silently producing no reply.
        if process.returncode != 0:
            print(
                f"[Jarvis] gemini exited with status {process.returncode}: "
                f"{process.stderr.strip()}"
            )

        response = process.stdout.strip()
        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        if is_init and not current_session_id:
            # Short pause before listing sessions after the first call.
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session protocol established: {current_session_id}")
    except Exception as e:
        print(f"Error running gemini: {e}")


# --- Startup Sequence ---
model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")

# Wake-word audio stream: 16 kHz, mono, 16-bit, 1280-sample frames.
CHUNK = 1280
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

recognizer = sr.Recognizer()
recognizer.pause_threshold = 1.2
recognizer.non_speaking_duration = 0.5
mic = sr.Microphone()

print("[Jarvis] Calibrating for ambient noise...")
with mic as source:
    recognizer.adjust_for_ambient_noise(source, duration=1)

print("[Jarvis] Booting system protocols...")
current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
run_gemini(
    f"System initialization complete. The current date and time is {current_time}. Awaiting orders, Sir.",
    is_init=True,
)

print(f"Listening for '{WAKE_WORD}'...")

try:
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)

        if prediction[WAKE_WORD] > SENSITIVITY:
            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
            # Pause wake-word capture while the recognizer uses the microphone.
            stream.stop_stream()

            in_conversation = True
            first_listening = True
            while in_conversation:
                # A different cue marks follow-up turns within one conversation.
                play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
                print("[Jarvis] Listening...")
                with mic as source:
                    try:
                        audio_cmd = recognizer.listen(
                            source, timeout=10, phrase_time_limit=15
                        )
                        print("[Jarvis] Transcribing...")
                        command = recognizer.recognize_google(audio_cmd)
                        print(f"[Jarvis] You said: {command}")
                        run_gemini(command)
                        first_listening = False
                    except sr.WaitTimeoutError:
                        print("[Jarvis] Session timed out.")
                        in_conversation = False
                    except sr.UnknownValueError:
                        print("[Jarvis] No speech detected. Ending session.")
                        in_conversation = False
                    except sr.RequestError as e:
                        print(f"[Jarvis] Speech service error: {e}")
                        in_conversation = False

            stream.start_stream()
            print(f"\nListening for '{WAKE_WORD}'...")
except KeyboardInterrupt:
    print("\nStopping...")
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()