# jarvis/jarvis.py

import os
import subprocess
import openwakeword
from openwakeword.model import Model
import pyaudio
import numpy as np
import speech_recognition as sr
import time
import re
from gtts import gTTS
import pygame
import io

# Configuration
# Name of the openWakeWord model to load; also the key used to read that
# model's score out of each prediction.
WAKE_WORD = "hey_jarvis"
# Detection threshold: a score strictly greater than this counts as a wake.
SENSITIVITY = 0.5
# Audio file handed to `afplay` as the "now listening" cue.
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"

# Initialize pygame mixer for audio playback
pygame.mixer.init()

def play_sound():
    """Play a system sound to indicate Jarvis is listening.

    Runs ``afplay`` on ``SYSTEM_SOUND`` and blocks until the player exits.
    The cue is best-effort: if the player cannot be launched (for example
    ``afplay`` is not installed on this machine), a message is printed and
    the function returns normally instead of raising.
    """
    try:
        subprocess.run(["afplay", SYSTEM_SOUND])
    except OSError as e:
        # A missing or unlaunchable player must not take the assistant down.
        print(f"[Jarvis] Could not play listening sound: {e}")

# Global session tracker
# Holds the Gemini session ID once run_gemini() has captured one. While it is
# None, run_gemini() omits --resume and starts a new conversation.
current_session_id = None

# Load the openWakeWord model using ONNX
model = Model(
    wakeword_models=[WAKE_WORD],
    inference_framework="onnx"
)

# Audio setup for openWakeWord
# Each stream.read(CHUNK) returns CHUNK frames; at RATE = 16000 Hz that is
# 1280 / 16000 = 80 ms of audio per prediction.
CHUNK = 1280
# 16-bit samples; the main loop decodes the raw bytes as np.int16.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

# Open the input stream that the main loop reads wake-word audio from.
audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

# Speech recognition setup
# Shared recognizer used to record and transcribe the spoken command.
recognizer = sr.Recognizer()

# Matches the entry numbered "1." and the bracketed ID on that same line.
# - The digit lookbehind keeps "11." or "21." from being read as entry 1.
# - The ID must look like a (possibly truncated) UUID: hex digits and dashes,
#   at least 8 characters. This skips bracketed tags inside a session title,
#   e.g. "1. [WIP] refactor ... [c16895c1-...]".
_FIRST_SESSION_RE = re.compile(r"(?<!\d)1\..*?\[([0-9a-fA-F-]{8,})\]")


def _parse_first_session_id(listing):
    """Return the bracketed ID on the "1." line of *listing*, or None."""
    match = _FIRST_SESSION_RE.search(listing or "")
    return match.group(1) if match else None


def get_latest_session_id():
    """Retrieve the UUID of the most recent Gemini session.

    Runs ``gemini --list-sessions`` and extracts the bracketed ID from the
    entry numbered ``1.`` in its standard output.

    NOTE(review): this assumes entry 1 is the most recent session -- confirm
    against the CLI's actual ordering.

    Returns:
        The session ID string, or ``None`` if the CLI could not be run or no
        matching entry was found.
    """
    try:
        # Only the subprocess call can fail here; parsing happens outside the
        # try so real parsing bugs are not silently swallowed.
        result = subprocess.run(
            ["gemini", "--list-sessions"], capture_output=True, text=True
        )
    except Exception as e:
        # Best-effort lookup: a missing CLI or launch failure just means
        # "no session known".
        print(f"Error fetching session ID: {e}")
        return None
    return _parse_first_session_id(result.stdout)

def speak_text(text):
    """Speak *text* aloud, preferring Google Text-to-Speech (gTTS).

    Markdown markers (``*``, ``#`` and backticks) are stripped first. The
    speech is synthesized into an in-memory buffer, played through the pygame
    mixer, and the call blocks until playback has finished. If synthesis or
    playback fails (e.g. the machine is offline), the text is handed to the
    ``say`` command instead; a failure of that fallback is reported, not
    raised.

    Args:
        text: The text to speak. ``None``, empty or whitespace-only input is
            ignored, as is input that contains nothing but markdown markers.
    """
    if not text:
        return

    # Remove markdown for cleaner speech (single pass over the string).
    clean_text = text.translate(str.maketrans("", "", "*#`"))

    # Stripping can leave nothing behind (e.g. "***"), so re-check afterwards
    # rather than asking the TTS engines to speak an empty string.
    if not clean_text.strip():
        return

    print("[Jarvis] Generating high-quality audio...")
    try:
        # Generate speech using gTTS
        tts = gTTS(text=clean_text, lang='en')

        # Save to a memory-based byte stream instead of a file
        fp = io.BytesIO()
        tts.write_to_fp(fp)
        fp.seek(0)

        # Play using pygame
        pygame.mixer.music.load(fp)
        pygame.mixer.music.play()

        # Wait until playback is finished. One Clock is reused for the whole
        # wait; tick(10) throttles the polling loop to about 10 checks/sec.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)

    except Exception as e:
        print(f"Error in TTS: {e}")
        # Fallback to 'say' command if gTTS fails (e.g. offline).
        # The text goes in on stdin rather than as an argument, so a reply
        # that starts with "-" (markdown bullets) is not parsed as options.
        try:
            subprocess.run(["say"], input=clean_text, text=True)
        except OSError as fallback_error:
            print(f"Error in fallback TTS: {fallback_error}")

def run_gemini(command):
    """Send *command* to the gemini CLI, print its reply, and speak it.

    If a session ID is already stored in ``current_session_id`` the call
    resumes that session; otherwise a new conversation is started. After the
    first call that exits successfully, the session ID is looked up and
    stored so later commands continue the same conversation.

    Args:
        command: The prompt text (the transcribed voice command). ``None``,
            empty or whitespace-only input is ignored.

    Side effects:
        May update the module-level ``current_session_id``. Prints progress
        and error messages. Errors are reported on the console, never raised.
    """
    global current_session_id

    # Nothing usable was transcribed: skip the CLI call entirely.
    if not command or not command.strip():
        return

    # NOTE(review): --yolo presumably auto-approves the CLI's tool actions,
    # and the prompt here is transcribed speech -- confirm this is intended.
    args = ["gemini", "--prompt", command, "--yolo"]

    if current_session_id:
        args.extend(["--resume", current_session_id])
        print(f"\n[Jarvis] Continuing session {current_session_id}...")
    else:
        print("\n[Jarvis] Starting new conversation session...")

    print(f"[Jarvis] Executing: {' '.join(args)}")

    try:
        # Capture stdout to speak it, but still let it print to the console
        process = subprocess.run(args, capture_output=True, text=True)
        response = process.stdout.strip()

        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        if process.returncode != 0:
            # stderr was captured, so surface it here or the failure is
            # invisible. Do not lock a session after a failed call: the
            # listing's first entry could belong to an older, unrelated
            # conversation.
            details = (process.stderr or "").strip()
            suffix = f": {details}" if details else ""
            print(f"[Jarvis] gemini exited with status {process.returncode}{suffix}")
            return

        # After the first successful call, capture the session ID
        if not current_session_id:
            # Brief pause before listing sessions, kept from the original flow.
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session locked: {current_session_id}")

    except Exception as e:
        print(f"Error running gemini: {e}")

def _listen_for_command():
    """Record one spoken command and return its transcription, or None.

    Opens the microphone, calibrates for ambient noise for half a second,
    then waits up to 5 seconds for speech to start and records at most
    10 seconds of it. Returns None (after printing the reason) when nothing
    was heard, the audio could not be understood, or the speech service
    failed.
    """
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source, duration=0.5)
        try:
            audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
            print("[Jarvis] Transcribing...")
            command = recognizer.recognize_google(audio_cmd)
        except sr.WaitTimeoutError:
            print("[Jarvis] No command detected.")
            return None
        except sr.UnknownValueError:
            print("[Jarvis] Could not understand audio.")
            return None
        except sr.RequestError as e:
            print(f"[Jarvis] Speech service error: {e}")
            return None

    print(f"[Jarvis] You said: {command}")
    return command


print(f"Listening for '{WAKE_WORD}'...")

try:
    while True:
        # 1. Listen for Wake Word
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)
        score = prediction[WAKE_WORD]

        if score > SENSITIVITY:
            print(f"\n[Jarvis] Wake word detected! (Score: {score:.2f})")

            # Pause wake-word capture while the command is handled.
            # Otherwise the stream keeps buffering through command capture,
            # the gemini call and TTS playback, and that stale audio
            # (including Jarvis's own voice) is fed to the model afterwards.
            stream.stop_stream()
            try:
                play_sound()

                # 2. Capture Command
                print("[Jarvis] Listening for command...")
                command = _listen_for_command()

                # 3. Execute. The microphone is already released here, so it
                # is not held open during the gemini call and TTS playback.
                if command:
                    run_gemini(command)
            finally:
                # Clear the model's buffers so the wake word that was just
                # heard cannot trigger a second detection, then resume.
                model.reset()
                stream.start_stream()

            print(f"\nListening for '{WAKE_WORD}'...")

except KeyboardInterrupt:
    print("\nStopping...")
finally:
    # Always release the audio device, however the loop ended.
    stream.stop_stream()
    stream.close()
    audio.terminate()