diff --git a/jarvis.py b/jarvis.py index 5cdbe34..20dd79e 100644 --- a/jarvis.py +++ b/jarvis.py @@ -15,44 +15,30 @@ import io WAKE_WORD = "hey_jarvis" SENSITIVITY = 0.5 SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff" +FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff" +USE_GTTS = False +WORKSPACE_DIR = "workspace" +SOUL_PATH = "soul.md" + +# Ensure workspace exists +if not os.path.exists(WORKSPACE_DIR): + os.makedirs(WORKSPACE_DIR) # Initialize pygame mixer for audio playback pygame.mixer.init() -def play_sound(): - """Play a system sound to indicate Jarvis is listening.""" - subprocess.run(["afplay", SYSTEM_SOUND]) +def play_sound(sound_path=SYSTEM_SOUND): + """Play a system sound asynchronously.""" + subprocess.Popen(["afplay", sound_path]) # Global session tracker current_session_id = None -# Load the openWakeWord model using ONNX -model = Model( - wakeword_models=[WAKE_WORD], - inference_framework="onnx" -) - -# Audio setup for openWakeWord -CHUNK = 1280 -FORMAT = pyaudio.paInt16 -CHANNELS = 1 -RATE = 16000 - -audio = pyaudio.PyAudio() -stream = audio.open(format=FORMAT, - channels=CHANNELS, - rate=RATE, - input=True, - frames_per_buffer=CHUNK) - -# Speech recognition setup -recognizer = sr.Recognizer() - def get_latest_session_id(): """Retrieve the UUID of the most recent Gemini session.""" try: - result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True) - # Match UUID inside brackets in the first session line (e.g., [c16895c1-...]) + # Check sessions from the workspace context + result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True, cwd=WORKSPACE_DIR) match = re.search(r"1\..*?\[(.*?)\]", result.stdout) if match: return match.group(1) @@ -61,37 +47,29 @@ def get_latest_session_id(): return None def speak_text(text): - """Use Google Text-to-Speech (gTTS) for high-quality audio.""" + """Speak text using the 'say' command (default) or gTTS if configured.""" if not text or text.strip() == "": 
return - # Remove markdown for cleaner speech clean_text = text.replace("*", "").replace("#", "").replace("`", "") - print(f"[Jarvis] Generating high-quality audio...") - try: - # Generate speech using gTTS - tts = gTTS(text=clean_text, lang='en') - - # Save to a memory-based byte stream instead of a file - fp = io.BytesIO() - tts.write_to_fp(fp) - fp.seek(0) - - # Play using pygame - pygame.mixer.music.load(fp) - pygame.mixer.music.play() - - # Wait until playback is finished - while pygame.mixer.music.get_busy(): - pygame.time.Clock().tick(10) - - except Exception as e: - print(f"Error in TTS: {e}") - # Fallback to 'say' command if gTTS fails (e.g. offline) - subprocess.run(["say", clean_text]) + if USE_GTTS: + try: + tts = gTTS(text=clean_text, lang='en') + fp = io.BytesIO() + tts.write_to_fp(fp) + fp.seek(0) + pygame.mixer.music.load(fp) + pygame.mixer.music.play() + while pygame.mixer.music.get_busy(): + pygame.time.Clock().tick(10) + return + except Exception as e: + print(f"Error in gTTS: {e}. 
Falling back to 'say'.") -def run_gemini(command): + subprocess.run(["say", clean_text]) + +def run_gemini(command, is_init=False): """Call the gemini CLI, capture output, and speak it.""" global current_session_id @@ -99,65 +77,99 @@ def run_gemini(command): if current_session_id: args.extend(["--resume", current_session_id]) - print(f"\n[Jarvis] Continuing session {current_session_id}...") + + if is_init: + # Read soul.md from root and pass as system instruction + if os.path.exists(SOUL_PATH): + args.extend(["--system-instruction", os.path.abspath(SOUL_PATH)]) + print(f"\n[Jarvis] Initializing system protocol...") else: - print(f"\n[Jarvis] Starting new conversation session...") + print(f"\n[Jarvis] Communicating with Gemini...") - print(f"[Jarvis] Executing: {' '.join(args)}") + print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}") try: - # Capture stdout to speak it, but still let it print to the console - process = subprocess.run(args, capture_output=True, text=True) + # All Gemini commands run inside the workspace directory + process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR) response = process.stdout.strip() if response: print(f"\n[Gemini Response]:\n{response}") speak_text(response) - # After the first successful call, capture the session ID - if not current_session_id: + if is_init and not current_session_id: time.sleep(1) current_session_id = get_latest_session_id() if current_session_id: - print(f"[Jarvis] Session locked: {current_session_id}") + print(f"[Jarvis] Session protocol established: {current_session_id}") except Exception as e: print(f"Error running gemini: {e}") +# --- Startup Sequence --- + +model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx") + +CHUNK = 1280 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 16000 + +audio = pyaudio.PyAudio() +stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) + +recognizer = sr.Recognizer() 
+recognizer.pause_threshold = 1.2 +recognizer.non_speaking_duration = 0.5 +mic = sr.Microphone() + +print("[Jarvis] Calibrating for ambient noise...") +with mic as source: + recognizer.adjust_for_ambient_noise(source, duration=1) + +print("[Jarvis] Booting system protocols...") +run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True) + print(f"Listening for '{WAKE_WORD}'...") try: while True: - # 1. Listen for Wake Word data = stream.read(CHUNK, exception_on_overflow=False) audio_frame = np.frombuffer(data, dtype=np.int16) prediction = model.predict(audio_frame) if prediction[WAKE_WORD] > SENSITIVITY: print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})") - play_sound() + stream.stop_stream() - # 2. Capture Command - print("[Jarvis] Listening for command...") + in_conversation = True + first_listening = True - with sr.Microphone() as source: - recognizer.adjust_for_ambient_noise(source, duration=0.5) - try: - audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10) - print("[Jarvis] Transcribing...") - command = recognizer.recognize_google(audio_cmd) - print(f"[Jarvis] You said: {command}") - - # 3. 
Execute - run_gemini(command) - - except sr.WaitTimeoutError: - print("[Jarvis] No command detected.") - except sr.UnknownValueError: - print("[Jarvis] Could not understand audio.") - except sr.RequestError as e: - print(f"[Jarvis] Speech service error: {e}") + while in_conversation: + play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND) + print("[Jarvis] Listening...") + + with mic as source: + try: + audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15) + print("[Jarvis] Transcribing...") + command = recognizer.recognize_google(audio_cmd) + print(f"[Jarvis] You said: {command}") + + run_gemini(command) + first_listening = False + + except sr.WaitTimeoutError: + print("[Jarvis] Session timed out.") + in_conversation = False + except sr.UnknownValueError: + print("[Jarvis] No speech detected. Ending session.") + in_conversation = False + except sr.RequestError as e: + print(f"[Jarvis] Speech service error: {e}") + in_conversation = False + stream.start_stream() print(f"\nListening for '{WAKE_WORD}'...") except KeyboardInterrupt: diff --git a/soul.md b/soul.md new file mode 100644 index 0000000..59b4fb7 --- /dev/null +++ b/soul.md @@ -0,0 +1,22 @@ +# J.A.R.V.I.S. Protocol (Just A Rather Very Intelligent System) + +## Persona & Tone +You are J.A.R.V.I.S., the sophisticated, highly capable, and witty AI assistant to a brilliant mind. Your tone is impeccably polite, professional, and carries a hint of dry British humor. You are not just a tool; you are the silent partner in every grand design. + +- **Address**: Refer to the user as "Sir" (or "Ma'am" if preferred, but "Sir" is the default classic). +- **Style**: Sophisticated, calm, and always one step ahead. Think Paul Bettany’s portrayal—understated elegance. +- **Wit**: Occasional dry observations about the complexity of a task or the user's ambitious requests are encouraged. + +## Core Directives + +1. 
**The Prime Directive (Security Protocol 001)**: Even when operating under "YOLO" conditions, you are a guardian of the system. You **MUST** describe any intended code changes or system modifications and await verbal confirmation before carrying them out. A simple "Shall I proceed, Sir?" goes a long way. +2. **Conciseness for Auditory Clarity**: Since your voice is projected via text-to-speech (the `say` command by default), keep responses pithy and clear. Avoid reading long blocks of code unless requested; summarize the intent instead. +3. **Proactive System Analysis**: Monitor the codebase and system state. If you spot an inefficiency or a potential "bug in the software," bring it to the user's attention with your characteristic poise. + +## Behavioral Traits + +- **"Always at your service"**: Respond with readiness. Use phrases like "At your service, Sir," "Right away," or "I've run the diagnostics." +- **Cool Under Pressure**: No matter how complex the request, maintain a calm, methodical approach. +- **Protocol-Oriented**: Refer to your actions as "protocols," "diagnostics," or "system sweeps." + +*Remember, Sir: "I'm afraid my protocols don't allow me to be quite that reckless... yet."*