Initial commit: Jarvis AI Assistant with openWakeWord and gTTS

2026-02-26 11:22:00 -05:00
commit 5ee5aec3a3
4 changed files with 223 additions and 0 deletions
@@ -0,0 +1,6 @@
 venv/
 __pycache__/
 *.pyc
 .DS_Store
 *.wav
 *.mp3
@@ -0,0 +1,43 @@
 # Jarvis Voice Assistant (openWakeWord + Gemini CLI)
 This project uses `openWakeWord` to listen for the "Hey Jarvis" wake word, then uses `SpeechRecognition` to capture a spoken command, sends it to the `gemini` CLI, and reads the response aloud with gTTS.
 ## Setup
 1. **Install PortAudio** (macOS):
   ```bash
   brew install portaudio
   ```
 2. **Create and Activate Virtual Environment**:
   ```bash
   python3 -m venv venv
   source venv/bin/activate
   ```
 3. **Install Dependencies**:
   ```bash
   pip install -r requirements.txt
   ```
 4. **Verify Gemini CLI**:
   Ensure `gemini` is installed and available in your PATH.
 ## Usage
 Run the script:
 ```bash
 source venv/bin/activate
 python jarvis.py
 ```
 1. Say "**Hey Jarvis**".
 2. You will hear a "tink" sound.
 3. Speak your command (e.g., "List the files in this directory" or "Check the weather").
 4. The script will transcribe your command, run `gemini --prompt "<your command>" --yolo`, and read Gemini's response aloud.
 ## How it Works
 - **openWakeWord**: Provides local, low-latency wake word detection.
 - **SpeechRecognition**: Uses Google's Web Speech API for transcription.
 - **Gemini CLI**: The brain that processes the commands and calls agent tools.
@@ -0,0 +1,168 @@
 import os
 import subprocess
 import openwakeword
 from openwakeword.model import Model
 import pyaudio
 import numpy as np
 import speech_recognition as sr
 import time
 import re
 from gtts import gTTS
 import pygame
 import io
 # Configuration
 # Name of the wake-word model to load; the same string is used as the key
 # into the prediction dict in the main loop.
 WAKE_WORD = "hey_jarvis"
 # Detection threshold: a score strictly greater than this counts as a hit.
 SENSITIVITY = 0.5
 # Sound file handed to `afplay` by play_sound() (a macOS system path).
 SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
 # Initialize pygame mixer for audio playback
 # (runs at import time; speak_text() plays through pygame.mixer.music).
 pygame.mixer.init()
 def play_sound():
    """Play a system sound to indicate Jarvis is listening.

    The cue is best-effort: if ``afplay`` cannot be started (for example
    because it is not installed on this platform), a message is printed and
    the function returns normally so the assistant keeps running.

    Returns:
        None.
    """
    try:
        # subprocess.run waits for afplay to exit before returning, so the
        # caller only starts listening once the cue is over.
        subprocess.run(["afplay", SYSTEM_SOUND])
    except OSError as e:
        print(f"[Jarvis] Could not play listening cue: {e}")
 # Global session tracker
 # Gemini session ID reused across commands; stays None until run_gemini()
 # obtains one.
 current_session_id = None
 # Load the openWakeWord model using ONNX
 # Only the WAKE_WORD model is requested; the main loop reads its score from
 # the prediction dict under that same name.
 model = Model(
    wakeword_models=[WAKE_WORD],
    inference_framework="onnx"
 )
 # Audio setup for openWakeWord
 # CHUNK is the number of samples read per loop iteration:
 # 1280 samples at RATE = 16000 Hz is 80 ms of audio.
 CHUNK = 1280
 # 16-bit samples, matching the np.int16 decoding done in the main loop.
 FORMAT = pyaudio.paInt16
 CHANNELS = 1
 RATE = 16000
 audio = pyaudio.PyAudio()
 # Input-only stream, opened at import time; the main loop's `finally`
 # clause stops and closes it and terminates the PyAudio instance.
 stream = audio.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
 # Speech recognition setup
 # A single Recognizer is shared for noise calibration, capture and
 # transcription of every command.
 recognizer = sr.Recognizer()
 def get_latest_session_id(listing=None):
    """Retrieve the UUID of the most recent Gemini session.

    Args:
        listing: Optional text output of ``gemini --list-sessions``. When
            omitted (the default) the CLI is invoked to obtain it.

    Returns:
        The UUID found in square brackets on the line numbered ``1.``, or
        ``None`` when the CLI cannot be run or no such line is present.
    """
    # NOTE(review): this assumes the CLI lists the most recent session as
    # entry 1 -- confirm against the installed gemini version.
    if listing is None:
        try:
            result = subprocess.run(
                ["gemini", "--list-sessions"], capture_output=True, text=True
            )
        except (OSError, subprocess.SubprocessError, UnicodeError) as e:
            print(f"Error fetching session ID: {e}")
            return None
        listing = result.stdout
    # Anchor to the start of a line so "11." or "21." cannot be mistaken for
    # entry 1, and require a UUID-shaped token so bracketed text inside a
    # session title (e.g. "[WIP]") is never returned as the ID.
    match = re.search(
        r"^[ \t]*1\.[ \t].*"
        r"\[([0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12})\]",
        listing,
        re.MULTILINE,
    )
    return match.group(1) if match else None
 def speak_text(text):
    """Use Google Text-to-Speech (gTTS) for high-quality audio.

    Markdown markers (``*``, ``#`` and backticks) are removed before
    synthesis. The audio is generated into memory, played through
    pygame's mixer, and the call blocks until playback has finished. If
    synthesis or playback fails, the ``say`` command is tried instead.

    Args:
        text: The text to speak. ``None``, empty, whitespace-only, or
            markup-only text is ignored.

    Returns:
        None.
    """
    if not text:
        return
    # Remove markdown for cleaner speech
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    # Check after cleaning: input such as "***" is non-empty before the
    # markers are stripped but leaves nothing to synthesize afterwards.
    if not clean_text.strip():
        return
    print("[Jarvis] Generating high-quality audio...")
    try:
        # Generate speech using gTTS
        tts = gTTS(text=clean_text, lang='en')
        # Save to a memory-based byte stream instead of a file
        fp = io.BytesIO()
        tts.write_to_fp(fp)
        fp.seek(0)
        # Play using pygame
        pygame.mixer.music.load(fp)
        pygame.mixer.music.play()
        # Wait until playback is finished, polling about ten times a second
        # with a single Clock instead of creating a new one per iteration.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    except Exception as e:
        print(f"Error in TTS: {e}")
        # Fallback to 'say' command if gTTS fails (e.g. offline)
        try:
            subprocess.run(["say", clean_text])
        except OSError as fallback_error:
            # `say` itself may be unavailable; speech is best-effort.
            print(f"Error in fallback TTS: {fallback_error}")
 def run_gemini(command):
    """Call the gemini CLI, capture output, and speak it.

    The first call starts a new conversation. Once a session ID has been
    stored in the module-level ``current_session_id``, later calls pass it
    with ``--resume`` so the same conversation continues.

    Args:
        command: Prompt text passed to ``gemini --prompt``.

    Returns:
        None. Failures are reported on stdout rather than raised.
    """
    global current_session_id
    # NOTE(review): `--yolo` is understood to auto-approve the CLI's tool
    # actions, and `command` is unreviewed speech-to-text output -- confirm
    # this level of trust is intended.
    args = ["gemini", "--prompt", command, "--yolo"]
    if current_session_id:
        args.extend(["--resume", current_session_id])
        print(f"\n[Jarvis] Continuing session {current_session_id}...")
    else:
        print("\n[Jarvis] Starting new conversation session...")
    print(f"[Jarvis] Executing: {' '.join(args)}")
    try:
        # Output is captured rather than streamed, so nothing appears on the
        # console until the CLI has exited; it is printed below.
        process = subprocess.run(args, capture_output=True, text=True)
        succeeded = process.returncode == 0
        if not succeeded:
            # Surface the failure instead of silently dropping stderr.
            print(f"[Jarvis] gemini exited with status {process.returncode}")
            error_output = process.stderr.strip()
            if error_output:
                print(f"[Gemini Error]:\n{error_output}")
        response = process.stdout.strip()
        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)
        # After the first successful call, capture the session ID. A failed
        # call is skipped so an unrelated session is not locked in.
        if succeeded and not current_session_id:
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session locked: {current_session_id}")
    except Exception as e:
        # Top-level boundary for one voice command: report and keep running.
        print(f"Error running gemini: {e}")
 def _capture_command():
    """Record one spoken command and return its transcription, or None."""
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source, duration=0.5)
        try:
            audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
        except sr.WaitTimeoutError:
            print("[Jarvis] No command detected.")
            return None
    # The microphone is released here, before the network transcription.
    print("[Jarvis] Transcribing...")
    try:
        return recognizer.recognize_google(audio_cmd)
    except sr.UnknownValueError:
        print("[Jarvis] Could not understand audio.")
    except sr.RequestError as e:
        print(f"[Jarvis] Speech service error: {e}")
    return None


 # Main loop: read fixed-size frames, score them with the wake-word model,
 # and on detection capture one command and hand it to run_gemini().
 print(f"Listening for '{WAKE_WORD}'...")
 try:
    while True:
        # 1. Listen for Wake Word
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)
        score = prediction[WAKE_WORD]
        if score <= SENSITIVITY:
            continue
        print(f"\n[Jarvis] Wake word detected! (Score: {score:.2f})")
        # Pause wake-word capture while the command is handled so the cue,
        # the user's command and Jarvis's own speech are not queued up and
        # scored as wake-word audio afterwards.
        stream.stop_stream()
        try:
            play_sound()
            # 2. Capture Command
            print("[Jarvis] Listening for command...")
            command = _capture_command()
            if command is not None:
                print(f"[Jarvis] You said: {command}")
                # 3. Execute
                run_gemini(command)
        finally:
            # Clear the model's buffered state so the detection that just
            # fired cannot immediately trigger again, then resume capture.
            model.reset()
            stream.start_stream()
        print(f"\nListening for '{WAKE_WORD}'...")
 except KeyboardInterrupt:
    print("\nStopping...")
 finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()
@@ -0,0 +1,6 @@
 openwakeword
 pyaudio
 requests
 SpeechRecognition
 numpy
 onnxruntime
 # Imported by jarvis.py for text-to-speech synthesis and playback
 gTTS
 pygame