feat: integrate J.A.R.V.I.S. personality and continuous conversation protocol

This commit is contained in:
Adolfo Reyna
2026-02-26 11:53:54 -05:00
parent 5ee5aec3a3
commit 6984e36f3b
2 changed files with 115 additions and 81 deletions

174
jarvis.py
View File

@@ -15,44 +15,30 @@ import io
# --- Configuration ---
# Name of the wake-word model; also the key looked up in each prediction.
WAKE_WORD = "hey_jarvis"
# A prediction score must exceed this value to count as a detection.
SENSITIVITY = 0.5
# Default cue played by play_sound() (first listening turn).
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
# Cue played for follow-up turns within the same conversation.
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
# When True, speak_text() tries gTTS + pygame before falling back to 'say'.
USE_GTTS = False
# Directory used as the working directory for every gemini CLI call.
WORKSPACE_DIR = "workspace"
# File whose absolute path is passed to gemini as the system instruction.
SOUL_PATH = "soul.md"

# Ensure workspace exists. exist_ok=True makes this a single race-free call
# instead of a separate existence check followed by a create.
os.makedirs(WORKSPACE_DIR, exist_ok=True)
# Initialize pygame mixer for audio playback.
# This runs at import time regardless of USE_GTTS; in this file
# pygame.mixer.music is only used by the gTTS branch of speak_text().
pygame.mixer.init()
def play_sound(sound_path=SYSTEM_SOUND):
    """Play a system sound asynchronously.

    Starts ``afplay`` and returns without waiting for it to finish.

    Args:
        sound_path: Path of the audio file to play. Defaults to
            SYSTEM_SOUND, so existing zero-argument calls keep working.
    """
    try:
        subprocess.Popen(["afplay", sound_path])
    except OSError as e:
        # The cue is only an audible hint; a missing or unrunnable afplay
        # binary should not take down the listening loop.
        print(f"[Jarvis] Could not play sound: {e}")
# Global session tracker: holds the Gemini session id once one has been
# captured by run_gemini(); None means no session has been established yet.
current_session_id = None
# NOTE(review): the model, audio-stream and recognizer setup below is repeated
# in the "Startup Sequence" section further down. In this diff rendering these
# look like the pre-change lines -- confirm only one copy exists in the file.
# Load the openWakeWord model using ONNX
model = Model(
wakeword_models=[WAKE_WORD],
inference_framework="onnx"
)
# Audio setup for openWakeWord: mono 16-bit input at 16000 Hz, delivered in
# buffers of CHUNK frames.
CHUNK = 1280
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
# Speech recognition setup (used to capture and transcribe the spoken command)
recognizer = sr.Recognizer()
def get_latest_session_id():
"""Retrieve the UUID of the most recent Gemini session."""
try:
result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True)
# Match UUID inside brackets in the first session line (e.g., [c16895c1-...])
# Check sessions from the workspace context
result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True, cwd=WORKSPACE_DIR)
match = re.search(r"1\..*?\[(.*?)\]", result.stdout)
if match:
return match.group(1)
@@ -61,37 +47,29 @@ def get_latest_session_id():
return None
def speak_text(text):
    """Speak text using the 'say' command (default) or gTTS if configured.

    Markdown markers (``*``, ``#`` and backticks) are removed before
    speaking. When USE_GTTS is true the text is synthesized with gTTS into
    an in-memory buffer and played through pygame, blocking until playback
    ends; if anything in that path fails, the error is printed and the
    'say' command is used instead.

    Args:
        text: Text to speak. None, empty, whitespace-only, or markdown-only
            input is ignored.
    """
    if not text or not text.strip():
        return

    # Remove markdown for cleaner speech (one pass over the string).
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    # Input such as "***" or "# " leaves nothing to say once stripped.
    if not clean_text.strip():
        return

    if USE_GTTS:
        try:
            tts = gTTS(text=clean_text, lang='en')
            # Keep the audio in a memory buffer instead of a file on disk.
            fp = io.BytesIO()
            tts.write_to_fp(fp)
            fp.seek(0)
            pygame.mixer.music.load(fp)
            pygame.mixer.music.play()
            # Block until playback is finished, polling with a single
            # reused clock rather than building a new one per iteration.
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
            return
        except Exception as e:
            # Deliberately broad: any gTTS/pygame failure (e.g. offline)
            # should fall through to the local 'say' command.
            print(f"Error in gTTS: {e}. Falling back to 'say'.")

    subprocess.run(["say", clean_text])
def run_gemini(command, is_init=False):
"""Call the gemini CLI, capture output, and speak it."""
global current_session_id
@@ -99,65 +77,99 @@ def run_gemini(command):
if current_session_id:
args.extend(["--resume", current_session_id])
print(f"\n[Jarvis] Continuing session {current_session_id}...")
if is_init:
# If soul.md exists, pass its absolute path via --system-instruction (the file itself is not read here)
if os.path.exists(SOUL_PATH):
args.extend(["--system-instruction", os.path.abspath(SOUL_PATH)])
print(f"\n[Jarvis] Initializing system protocol...")
else:
print(f"\n[Jarvis] Starting new conversation session...")
print(f"\n[Jarvis] Communicating with Gemini...")
print(f"[Jarvis] Executing: {' '.join(args)}")
print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")
try:
# Capture stdout to speak it, but still let it print to the console
process = subprocess.run(args, capture_output=True, text=True)
# All Gemini commands run inside the workspace directory
process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR)
response = process.stdout.strip()
if response:
print(f"\n[Gemini Response]:\n{response}")
speak_text(response)
# After the first successful call, capture the session ID
if not current_session_id:
if is_init and not current_session_id:
time.sleep(1)
current_session_id = get_latest_session_id()
if current_session_id:
print(f"[Jarvis] Session locked: {current_session_id}")
print(f"[Jarvis] Session protocol established: {current_session_id}")
except Exception as e:
print(f"Error running gemini: {e}")
# --- Startup Sequence ---
# Wake-word detector: the WAKE_WORD model, run through the ONNX backend.
model = Model(
    wakeword_models=[WAKE_WORD],
    inference_framework="onnx",
)

# Capture parameters for the wake-word input stream: mono 16-bit audio at
# 16000 Hz, read in buffers of CHUNK frames.
CHUNK = 1280
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

# Speech recognizer and the microphone it listens on for spoken commands.
# NOTE(review): the two thresholds are presumably in seconds -- confirm
# against the SpeechRecognition documentation.
recognizer = sr.Recognizer()
recognizer.pause_threshold = 1.2
recognizer.non_speaking_duration = 0.5
mic = sr.Microphone()

# Calibrate once at startup for ambient noise.
print("[Jarvis] Calibrating for ambient noise...")
with mic as source:
    recognizer.adjust_for_ambient_noise(source, duration=1)

# First Gemini call, flagged as the initialization call.
print("[Jarvis] Booting system protocols...")
run_gemini(
    "System initialization complete. Awaiting orders, Sir.",
    is_init=True,
)

print(f"Listening for '{WAKE_WORD}'...")
try:
while True:
# 1. Listen for Wake Word
data = stream.read(CHUNK, exception_on_overflow=False)
audio_frame = np.frombuffer(data, dtype=np.int16)
prediction = model.predict(audio_frame)
if prediction[WAKE_WORD] > SENSITIVITY:
print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
play_sound()
stream.stop_stream()
# 2. Capture Command
print("[Jarvis] Listening for command...")
in_conversation = True
first_listening = True
with sr.Microphone() as source:
recognizer.adjust_for_ambient_noise(source, duration=0.5)
try:
audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
print("[Jarvis] Transcribing...")
command = recognizer.recognize_google(audio_cmd)
print(f"[Jarvis] You said: {command}")
# 3. Execute
run_gemini(command)
except sr.WaitTimeoutError:
print("[Jarvis] No command detected.")
except sr.UnknownValueError:
print("[Jarvis] Could not understand audio.")
except sr.RequestError as e:
print(f"[Jarvis] Speech service error: {e}")
while in_conversation:
play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
print("[Jarvis] Listening...")
with mic as source:
try:
audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15)
print("[Jarvis] Transcribing...")
command = recognizer.recognize_google(audio_cmd)
print(f"[Jarvis] You said: {command}")
run_gemini(command)
first_listening = False
except sr.WaitTimeoutError:
print("[Jarvis] Session timed out.")
in_conversation = False
except sr.UnknownValueError:
print("[Jarvis] No speech detected. Ending session.")
in_conversation = False
except sr.RequestError as e:
print(f"[Jarvis] Speech service error: {e}")
in_conversation = False
stream.start_stream()
print(f"\nListening for '{WAKE_WORD}'...")
except KeyboardInterrupt: