Initial commit: Jarvis AI Assistant with openWakeWord and gTTS

2026-02-26 11:22:00 -05:00
commit 5ee5aec3a3
4 changed files with 223 additions and 0 deletions
@@ -0,0 +1,6 @@
+venv/
+__pycache__/
+*.pyc
+.DS_Store
+*.wav
+*.mp3
@@ -0,0 +1,43 @@
+# Jarvis Voice Assistant (openWakeWord + Gemini CLI)
+
+This project uses `openWakeWord` to listen for the "Hey Jarvis" wake word, then uses `SpeechRecognition` to capture a spoken command, sends it to the `gemini` CLI, and reads the response aloud with gTTS.
+
+## Setup
+
+1. **Install PortAudio** (macOS):
+   ```bash
+   brew install portaudio
+   ```
+
+2. **Create and Activate Virtual Environment**:
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate
+   ```
+
+3. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. **Verify Gemini CLI**:
+   Ensure `gemini` is installed and available in your PATH.
+
+## Usage
+
+Run the script:
+```bash
+source venv/bin/activate
+python jarvis.py
+```
+
+1. Say "**Hey Jarvis**".
+2. You will hear a "tink" sound.
+3. Speak your command (e.g., "List the files in this directory" or "Check the weather").
+4. The script will transcribe your command, run `gemini --prompt "<your command>" --yolo`, and read the response aloud.
+
+## How it Works
+
+- **openWakeWord**: Provides local, low-latency wake word detection.
+- **SpeechRecognition**: Uses Google's Web Speech API for transcription.
+- **Gemini CLI**: The brain that processes the commands and calls agent tools.
@@ -0,0 +1,168 @@
+import os
+import subprocess
+import openwakeword
+from openwakeword.model import Model
+import pyaudio
+import numpy as np
+import speech_recognition as sr
+import time
+import re
+from gtts import gTTS
+import pygame
+import io
+
+# Configuration
+WAKE_WORD = "hey_jarvis"  # model name given to openWakeWord; also the key read from each prediction
+SENSITIVITY = 0.5  # a prediction score above this value counts as a detection
+SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"  # cue file played by play_sound() via afplay
+
+# Initialize pygame mixer for audio playback (speak_text() plays synthesized speech through it)
+pygame.mixer.init()
+
+def play_sound():
+    """Play a system sound to indicate Jarvis is listening."""
+    subprocess.run(["afplay", SYSTEM_SOUND])
+
+# Global session tracker: set by run_gemini() after its first call, then passed to gemini via --resume
+current_session_id = None
+
+# Load the openWakeWord model using ONNX
+model = Model(
+    wakeword_models=[WAKE_WORD],
+    inference_framework="onnx"
+)
+
+# Audio setup for openWakeWord
+CHUNK = 1280  # samples per read: 1280 / 16000 = 80 ms of audio at RATE
+FORMAT = pyaudio.paInt16  # 16-bit samples; the main loop decodes them as np.int16
+CHANNELS = 1  # single (mono) input channel
+RATE = 16000  # sample rate passed to the input stream
+
+audio = pyaudio.PyAudio()
+stream = audio.open(format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK)
+
+# Speech recognition setup (used after a wake-word detection to capture and transcribe the command)
+recognizer = sr.Recognizer()
+
+def get_latest_session_id():
+    """Retrieve the UUID of the most recent Gemini session."""
+    try:
+        result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True)
+        # Match UUID inside brackets in the first session line (e.g., [c16895c1-...])
+        match = re.search(r"1\..*?\[(.*?)\]", result.stdout)
+        if match:
+            return match.group(1)
+    except Exception as e:
+        print(f"Error fetching session ID: {e}")
+    return None
+
+def speak_text(text):
+    """Use Google Text-to-Speech (gTTS) for high-quality audio."""
+    if not text or text.strip() == "":
+        return
+    
+    # Remove markdown for cleaner speech
+    clean_text = text.replace("*", "").replace("#", "").replace("`", "")
+    
+    print(f"[Jarvis] Generating high-quality audio...")
+    try:
+        # Generate speech using gTTS
+        tts = gTTS(text=clean_text, lang='en')
+        
+        # Save to a memory-based byte stream instead of a file
+        fp = io.BytesIO()
+        tts.write_to_fp(fp)
+        fp.seek(0)
+        
+        # Play using pygame
+        pygame.mixer.music.load(fp)
+        pygame.mixer.music.play()
+        
+        # Wait until playback is finished
+        while pygame.mixer.music.get_busy():
+            pygame.time.Clock().tick(10)
+            
+    except Exception as e:
+        print(f"Error in TTS: {e}")
+        # Fallback to 'say' command if gTTS fails (e.g. offline)
+        subprocess.run(["say", clean_text])
+
+def run_gemini(command):
+    """Call the gemini CLI, capture output, and speak it."""
+    global current_session_id
+    
+    args = ["gemini", "--prompt", command, "--yolo"]
+    
+    if current_session_id:
+        args.extend(["--resume", current_session_id])
+        print(f"\n[Jarvis] Continuing session {current_session_id}...")
+    else:
+        print(f"\n[Jarvis] Starting new conversation session...")
+
+    print(f"[Jarvis] Executing: {' '.join(args)}")
+    
+    try:
+        # Capture stdout to speak it, but still let it print to the console
+        process = subprocess.run(args, capture_output=True, text=True)
+        response = process.stdout.strip()
+        
+        if response:
+            print(f"\n[Gemini Response]:\n{response}")
+            speak_text(response)
+        
+        # After the first successful call, capture the session ID
+        if not current_session_id:
+            time.sleep(1)
+            current_session_id = get_latest_session_id()
+            if current_session_id:
+                print(f"[Jarvis] Session locked: {current_session_id}")
+                
+    except Exception as e:
+        print(f"Error running gemini: {e}")
+
+print(f"Listening for '{WAKE_WORD}'...")
+
+try:
+    while True:
+        # 1. Listen for Wake Word
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_frame = np.frombuffer(data, dtype=np.int16)
+        prediction = model.predict(audio_frame)
+
+        if prediction[WAKE_WORD] > SENSITIVITY:
+            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
+            play_sound()
+            
+            # 2. Capture Command
+            print("[Jarvis] Listening for command...")
+            
+            with sr.Microphone() as source:
+                recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                try:
+                    audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
+                    print("[Jarvis] Transcribing...")
+                    command = recognizer.recognize_google(audio_cmd)
+                    print(f"[Jarvis] You said: {command}")
+                    
+                    # 3. Execute
+                    run_gemini(command)
+                    
+                except sr.WaitTimeoutError:
+                    print("[Jarvis] No command detected.")
+                except sr.UnknownValueError:
+                    print("[Jarvis] Could not understand audio.")
+                except sr.RequestError as e:
+                    print(f"[Jarvis] Speech service error: {e}")
+            
+            print(f"\nListening for '{WAKE_WORD}'...")
+
+except KeyboardInterrupt:
+    print("\nStopping...")
+finally:
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
@@ -0,0 +1,6 @@
+openwakeword
+pyaudio
+requests
+SpeechRecognition
+numpy
+onnxruntime
+gTTS
+pygame