# jarvis/jarvis.py

import os
import subprocess
import openwakeword
from openwakeword.model import Model
import pyaudio
import numpy as np
import speech_recognition as sr
import time
import re
from gtts import gTTS
import pygame
import io

# Configuration
# Name of the wake-word model to load; also the key read from each prediction.
WAKE_WORD = "hey_jarvis"
# Score threshold: a wake-word prediction must exceed this to start a conversation.
SENSITIVITY = 0.5
# Chime played before the first listen of a conversation (play_sound's default).
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
# Chime played before each follow-up listen within the same conversation.
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
# When True, speak_text tries gTTS + pygame first; otherwise it uses `say`.
USE_GTTS = False
# Working directory for every gemini CLI invocation.
WORKSPACE_DIR = "workspace"
# File whose absolute path is passed via --system-instruction on the init call.
SOUL_PATH = "soul.md"

# Ensure workspace exists. exist_ok=True makes this one race-free call instead
# of an exists() check followed by makedirs(); it also fails fast at startup
# if the path exists but is not a directory.
os.makedirs(WORKSPACE_DIR, exist_ok=True)

# Initialize pygame mixer for audio playback (used by speak_text's gTTS path)
pygame.mixer.init()

def play_sound(sound_path=SYSTEM_SOUND):
    """Play a sound file asynchronously with the ``afplay`` command.

    The player is started in the background and this function returns
    immediately without waiting for playback to finish.

    Args:
        sound_path: Path of the audio file handed to ``afplay``.
    """
    try:
        subprocess.Popen(["afplay", sound_path])
    except OSError as e:
        # The chime is only an audible cue; a missing or unlaunchable player
        # must not bring down the caller's listening loop.
        print(f"Error playing sound: {e}")

# Global session tracker
# Session identifier passed to `gemini --resume` by run_gemini. Stays None
# until run_gemini stores the value returned by get_latest_session_id()
# during the initialization call.
current_session_id = None

def get_latest_session_id():
    """Return the bracketed ID of entry ``1.`` from ``gemini --list-sessions``.

    The CLI is run from the workspace directory. ``None`` is returned when the
    command cannot be run or its output contains no matching entry; errors are
    printed rather than raised.

    NOTE(review): treating entry 1 as the most recent session is an
    assumption about the CLI's listing order -- confirm.
    """
    session_id = None
    try:
        # Check sessions from the workspace context
        listing = subprocess.run(
            ["gemini", "--list-sessions"],
            capture_output=True,
            text=True,
            cwd=WORKSPACE_DIR,
        ).stdout
        first_entry = re.search(r"1\..*?\[(.*?)\]", listing)
        if first_entry is not None:
            session_id = first_entry.group(1)
    except Exception as e:
        print(f"Error fetching session ID: {e}")
    return session_id

def speak_text(text):
    """Speak text aloud, via gTTS when enabled and the ``say`` command otherwise.

    Markdown markers (``*``, ``#`` and backticks) are removed first so they are
    not read out. Nothing is spoken when the text is empty, or is empty or
    whitespace-only once those markers are removed. If gTTS playback fails the
    error is printed and ``say`` is used as a fallback.

    Args:
        text: The text to speak; may be ``None`` or empty.
    """
    if not text:
        return

    # Strip all three marker characters in a single pass.
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    if not clean_text.strip():
        # Covers whitespace-only input and markdown-only input such as "***".
        return

    if USE_GTTS:
        try:
            tts = gTTS(text=clean_text, lang='en')
            fp = io.BytesIO()
            tts.write_to_fp(fp)
            fp.seek(0)
            pygame.mixer.music.load(fp)
            pygame.mixer.music.play()
            # One clock for the whole wait; tick(10) caps polling at ~10 Hz.
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
            return
        except Exception as e:
            print(f"Error in gTTS: {e}. Falling back to 'say'.")

    # Feed the text on stdin instead of argv: text that begins with "-" (for
    # example a bullet list) would otherwise be parsed by `say` as options.
    subprocess.run(["say"], input=clean_text.encode("utf-8"))

def run_gemini(command, is_init=False):
    """Send a prompt to the gemini CLI, then print and speak its reply.

    The CLI is run inside WORKSPACE_DIR with ``--yolo``. When a session ID is
    already known, the call resumes that session via ``--resume``. A non-zero
    exit status is reported together with the CLI's stderr so that failures
    are not silent. Errors launching the CLI are printed, not raised.

    Args:
        command: Prompt text passed to ``--prompt``.
        is_init: True for the startup call. The absolute path of SOUL_PATH is
            then passed via ``--system-instruction`` (if that file exists),
            and, when no session ID is known yet, one is fetched afterwards
            and stored in the module-level ``current_session_id``.
    """
    global current_session_id

    args = ["gemini", "--prompt", command, "--yolo"]

    if current_session_id:
        args.extend(["--resume", current_session_id])

    if is_init:
        # Read soul.md from root and pass as system instruction
        if os.path.exists(SOUL_PATH):
            args.extend(["--system-instruction", os.path.abspath(SOUL_PATH)])
        print("\n[Jarvis] Initializing system protocol...")
    else:
        print("\n[Jarvis] Communicating with Gemini...")

    print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")

    try:
        # All Gemini commands run inside the workspace directory
        process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR)
        response = process.stdout.strip()

        if process.returncode != 0:
            # Without this a failed CLI call produces no output at all and
            # the assistant simply appears to ignore the request.
            details = process.stderr.strip()
            suffix = f": {details}" if details else ""
            print(f"[Jarvis] gemini exited with code {process.returncode}{suffix}")

        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        if is_init and not current_session_id:
            # Brief pause before listing sessions, then remember the ID so
            # later calls can resume the same session.
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session protocol established: {current_session_id}")

    except Exception as e:
        print(f"Error running gemini: {e}")

# --- Startup Sequence ---

# Capture format for the wake-word stream: 16-bit mono at 16 kHz, read in
# 1280-sample chunks (80 ms of audio per chunk at this rate).
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1280

# Wake-word detector, loaded with the single configured model.
model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")

# Raw input stream that feeds the detector in the main loop.
audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

# Recognizer and microphone used for spoken commands after the wake word.
recognizer = sr.Recognizer()
recognizer.non_speaking_duration = 0.5
recognizer.pause_threshold = 1.2
mic = sr.Microphone()

print("[Jarvis] Calibrating for ambient noise...")
with mic as source:
    recognizer.adjust_for_ambient_noise(source, duration=1)

# Initialization prompt; run_gemini also records the session ID on this call.
print("[Jarvis] Booting system protocols...")
run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True)

print(f"Listening for '{WAKE_WORD}'...")

# Main loop: feed microphone chunks to the wake-word model; on detection,
# pause the wake-word stream and hold a conversation (listen -> transcribe ->
# run_gemini) until a listen times out or transcription fails. Ctrl-C exits,
# and the audio stream is always released on the way out.
try:
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)

        if prediction[WAKE_WORD] > SENSITIVITY:
            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
            # Pause wake-word capture while the conversation uses the mic.
            stream.stop_stream()

            in_conversation = True
            first_listening = True

            while in_conversation:
                # Different chime for the first prompt and for follow-ups.
                play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
                print("[Jarvis] Listening...")

                try:
                    # Hold the microphone only while capturing the phrase, so
                    # the device is released during transcription, the Gemini
                    # call and speech playback.
                    with mic as source:
                        audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15)
                    print("[Jarvis] Transcribing...")
                    command = recognizer.recognize_google(audio_cmd)
                    print(f"[Jarvis] You said: {command}")

                    run_gemini(command)
                    first_listening = False

                except sr.WaitTimeoutError:
                    print("[Jarvis] Session timed out.")
                    in_conversation = False
                except sr.UnknownValueError:
                    print("[Jarvis] No speech detected. Ending session.")
                    in_conversation = False
                except sr.RequestError as e:
                    print(f"[Jarvis] Speech service error: {e}")
                    in_conversation = False

            # Clear the model's buffered audio features and predictions so the
            # wake word heard before the conversation cannot re-trigger
            # detection as soon as capture resumes.
            model.reset()
            stream.start_stream()
            print(f"\nListening for '{WAKE_WORD}'...")

except KeyboardInterrupt:
    print("\nStopping...")
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()