Files
jarvis/jarvis.py

202 lines
6.7 KiB
Python

import os
import subprocess
import openwakeword
from openwakeword.model import Model
import pyaudio
import numpy as np
import speech_recognition as sr
import time
import re
from gtts import gTTS
import pygame
import io
# Configuration
WAKE_WORD = "hey_jarvis"  # wake-word model name; also the key read from each prediction
SENSITIVITY = 0.5  # a wake-word score must exceed this to start a conversation
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"  # cue for the first listen after the wake word
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"  # cue for follow-up listens
USE_GTTS = False  # True: speak through gTTS + pygame; False: use the 'say' command
WORKSPACE_DIR = "workspace"  # working directory for every gemini invocation
SOUL_PATH = "soul.md"  # optional system-prompt file, resolved against the process cwd

# Ensure workspace exists.
# exist_ok=True replaces the exists()-then-makedirs() pair: there is no window
# between the check and the creation, and a non-directory already sitting at
# this path fails here instead of later when it is used as a subprocess cwd.
os.makedirs(WORKSPACE_DIR, exist_ok=True)
# Initialize pygame mixer for audio playback.
# speak_text() plays gTTS audio through pygame.mixer.music when USE_GTTS is enabled.
pygame.mixer.init()
def play_sound(sound_path=SYSTEM_SOUND):
    """Play a sound file asynchronously with the 'afplay' command.

    The player process is started and not waited on, so the caller continues
    immediately while the sound plays.

    Args:
        sound_path: Path of the audio file to play. Defaults to SYSTEM_SOUND.
    """
    try:
        subprocess.Popen(["afplay", sound_path])
    except OSError as e:
        # The chime is only an audible cue; failing to launch the player
        # (e.g. the binary is missing) must not abort the conversation loop.
        print(f"[Jarvis] Could not play sound '{sound_path}': {e}")
# Global session tracker.
# run_gemini() fills this in after the initialization call (using
# get_latest_session_id()) and, once it is set, passes it to gemini via --resume.
current_session_id = None
def get_latest_session_id():
    """Retrieve the UUID of the most recent Gemini session.

    Runs ``gemini --list-sessions`` with WORKSPACE_DIR as the working
    directory and scans its stdout for session IDs written as ``[UUID]``.

    Returns:
        The last UUID found in the listing, or None when the listing contains
        no UUID or the command could not be run.
    """
    # Accept only a canonical 8-4-4-4-12 hex UUID inside the brackets. A loose
    # "[hex-and-dashes]" pattern would also capture unrelated bracketed tokens
    # such as "[2024-05-01]" or "[1]" appearing elsewhere in the listing.
    uuid_in_brackets = r"\[([0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12})\]"
    try:
        # Check sessions from the workspace context
        result = subprocess.run(
            ["gemini", "--list-sessions"],
            capture_output=True,
            text=True,
            cwd=WORKSPACE_DIR,
        )
        matches = re.findall(uuid_in_brackets, result.stdout)
        if matches:
            # Return the last one in the list
            return matches[-1]
    except Exception as e:
        # Deliberately best-effort: without a session ID the assistant still
        # works, it just cannot resume the conversation.
        print(f"Error fetching session ID: {e}")
    return None
def speak_text(text):
    """Speak text using the 'say' command (default) or gTTS if configured.

    The markdown markers ``*``, ``#`` and the backtick are removed before
    speaking. When USE_GTTS is true the text is synthesized with gTTS and
    played through pygame, blocking until playback ends; if that fails, the
    error is printed and the 'say' command is used instead.

    Args:
        text: Text to speak. None, empty text, and text that is blank once the
            markdown markers are removed are ignored.
    """
    if not text:
        return

    # Single pass that drops the markdown markers.
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    # Check emptiness AFTER stripping: input such as "***" contains nothing to
    # speak and should not be handed to either speech backend.
    if not clean_text.strip():
        return

    if USE_GTTS:
        try:
            tts = gTTS(text=clean_text, lang='en')
            fp = io.BytesIO()
            tts.write_to_fp(fp)
            fp.seek(0)
            pygame.mixer.music.load(fp)
            pygame.mixer.music.play()
            # One clock for the whole wait instead of a new one per iteration.
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
            return
        except Exception as e:
            print(f"Error in gTTS: {e}. Falling back to 'say'.")

    # Feed the text on stdin rather than as a command-line argument: a reply
    # that starts with "-" (e.g. a markdown bullet list) would otherwise be
    # parsed by 'say' as an option instead of being spoken.
    subprocess.run(["say"], input=clean_text, text=True)
def run_gemini(command, is_init=False):
    """Call the gemini CLI, capture output, and speak it.

    Builds ``gemini --prompt <command> --yolo`` (plus ``--resume <id>`` once a
    session ID is known) and runs it with WORKSPACE_DIR as the working
    directory. If SOUL_PATH exists, its content is written to
    ``.system_prompt.md`` inside the workspace and that file's absolute path
    is exported to the child process as GEMINI_SYSTEM_MD. Any stdout is
    printed and spoken with speak_text().

    Args:
        command: Prompt text sent to gemini.
        is_init: True for the startup call. The current date/time is then
            prepended to the system prompt, and afterwards the newest session
            ID is stored in the global ``current_session_id`` if none is set.
    """
    global current_session_id

    args = ["gemini", "--prompt", command, "--yolo"]
    if current_session_id:
        args.extend(["--resume", current_session_id])

    # Set up environment for Gemini CLI
    env = os.environ.copy()
    if os.path.exists(SOUL_PATH):
        # Explicit UTF-8 so reading/writing the prompt does not depend on the
        # locale's default encoding.
        with open(SOUL_PATH, 'r', encoding="utf-8") as f:
            soul_content = f.read()

        # Inject date/time context only on initialization
        if is_init:
            current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
            soul_content = f"Temporal Context: The current date and time is {current_time}.\n\n" + soul_content
            print("\n[Jarvis] Initializing system protocol with temporal context...")
        else:
            print("\n[Jarvis] Communicating with Gemini...")

        # Use a temporary file for the system instruction. The path is made
        # absolute because the child process runs with a different cwd.
        system_md_path = os.path.abspath(os.path.join(WORKSPACE_DIR, ".system_prompt.md"))
        with open(system_md_path, 'w', encoding="utf-8") as f:
            f.write(soul_content)
        env["GEMINI_SYSTEM_MD"] = system_md_path
    elif is_init:
        print("\n[Jarvis] Initializing system protocol...")
    else:
        print("\n[Jarvis] Communicating with Gemini...")

    print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")
    try:
        # All Gemini commands run inside the workspace directory
        process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env)

        # stderr is captured, so without this a failing CLI call would look
        # exactly like an empty answer.
        if process.returncode != 0:
            print(f"[Jarvis] gemini exited with code {process.returncode}: {process.stderr.strip()}")

        response = process.stdout.strip()
        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        if is_init and not current_session_id:
            # Short pause before listing sessions so the one just created can
            # show up in the listing.
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session protocol established: {current_session_id}")
    except Exception as e:
        print(f"Error running gemini: {e}")
# --- Startup Sequence ---
model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")

# Capture settings for the wake-word stream: 16-bit mono at 16 kHz, read in
# 1280-sample chunks (80 ms of audio per prediction).
CHUNK = 1280
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

# Speech recognizer used for the spoken command after the wake word.
recognizer = sr.Recognizer()
recognizer.pause_threshold = 1.2
recognizer.non_speaking_duration = 0.5
mic = sr.Microphone()

print("[Jarvis] Calibrating for ambient noise...")
with mic as source:
    recognizer.adjust_for_ambient_noise(source, duration=1)

print("[Jarvis] Booting system protocols...")
current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
run_gemini(f"System initialization complete. The current date and time is {current_time}. Awaiting orders, Sir.", is_init=True)

print(f"Listening for '{WAKE_WORD}'...")
try:
    while True:
        # Wake-word phase: score each chunk until the threshold is exceeded.
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)
        if prediction[WAKE_WORD] > SENSITIVITY:
            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
            # Pause wake-word capture while the conversation is running.
            stream.stop_stream()

            in_conversation = True
            first_listening = True
            # Conversation phase: keep taking commands until a listen attempt
            # times out, is unintelligible, or the speech service fails.
            while in_conversation:
                play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
                print("[Jarvis] Listening...")
                with mic as source:
                    try:
                        audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15)
                        print("[Jarvis] Transcribing...")
                        command = recognizer.recognize_google(audio_cmd)
                        print(f"[Jarvis] You said: {command}")
                        run_gemini(command)
                        first_listening = False
                    except sr.WaitTimeoutError:
                        print("[Jarvis] Session timed out.")
                        in_conversation = False
                    except sr.UnknownValueError:
                        print("[Jarvis] No speech detected. Ending session.")
                        in_conversation = False
                    except sr.RequestError as e:
                        print(f"[Jarvis] Speech service error: {e}")
                        in_conversation = False

            # Clear the model's prediction/feature buffers before resuming:
            # they still hold the audio that produced this detection and
            # could otherwise trigger again on the first chunks read.
            model.reset()
            stream.start_stream()
            print(f"\nListening for '{WAKE_WORD}'...")
except KeyboardInterrupt:
    print("\nStopping...")
finally:
    # Release the audio device however the loop ended.
    stream.stop_stream()
    stream.close()
    audio.terminate()