From 5ee5aec3a39d04ada4055c847e77bde57dd8595e Mon Sep 17 00:00:00 2001
From: Adolfo Reyna
Date: Thu, 26 Feb 2026 11:22:00 -0500
Subject: [PATCH] Initial commit: Jarvis AI Assistant with openWakeWord and gTTS

---
 .gitignore       |   6 ++
 README.md        |  43 ++++++++++++
 jarvis.py        | 168 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   6 ++
 4 files changed, 223 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 jarvis.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..716bf61
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+venv/
+__pycache__/
+*.pyc
+.DS_Store
+*.wav
+*.mp3
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f9a9b2d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Jarvis Voice Assistant (openWakeWord + Gemini CLI)
+
+This project uses `openWakeWord` to listen for the "Hey Jarvis" wake word and then uses `SpeechRecognition` to capture a command and send it to the `gemini` CLI.
+
+## Setup
+
+1. **Install PortAudio** (macOS):
+   ```bash
+   brew install portaudio
+   ```
+
+2. **Create and Activate Virtual Environment**:
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate
+   ```
+
+3. **Install Dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. **Verify Gemini CLI**:
+   Ensure `gemini` is installed and available in your PATH.
+
+## Usage
+
+Run the script:
+```bash
+source venv/bin/activate
+python jarvis.py
+```
+
+1. Say "**Hey Jarvis**".
+2. You will hear a "tink" sound.
+3. Speak your command (e.g., "List the files in this directory" or "Check the weather").
+4. The script will transcribe your command and run `gemini --prompt "<your command>" --yolo`.
+
+## How it Works
+
+- **openWakeWord**: Provides local, low-latency wake word detection.
+- **SpeechRecognition**: Uses Google's Web Speech API for transcription.
+- **Gemini CLI**: The brain that processes the commands and calls agent tools.
diff --git a/jarvis.py b/jarvis.py
new file mode 100644
index 0000000..5cdbe34
--- /dev/null
+++ b/jarvis.py
@@ -0,0 +1,223 @@
+"""Jarvis voice assistant.
+
+Listens for the "hey_jarvis" wake word with openWakeWord, transcribes the
+spoken command with SpeechRecognition, sends it to the `gemini` CLI and
+speaks the reply with gTTS (falling back to the macOS `say` command).
+"""
+
+import os
+import subprocess
+import openwakeword
+from openwakeword.model import Model
+import pyaudio
+import numpy as np
+import speech_recognition as sr
+import time
+import re
+from gtts import gTTS
+import pygame
+import io
+
+# Configuration
+WAKE_WORD = "hey_jarvis"
+SENSITIVITY = 0.5  # Wake-word score that must be exceeded to trigger
+SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
+
+# Markdown markers removed from responses before they are spoken
+MARKDOWN_CHARS = str.maketrans("", "", "*#`")
+
+# Initialize pygame mixer for audio playback
+pygame.mixer.init()
+
+
+def play_sound():
+    """Play a system sound to indicate Jarvis is listening."""
+    subprocess.run(["afplay", SYSTEM_SOUND])
+
+
+# Global session tracker
+current_session_id = None
+
+# Load the openWakeWord model using ONNX
+model = Model(
+    wakeword_models=[WAKE_WORD],
+    inference_framework="onnx"
+)
+
+# Audio setup for openWakeWord
+CHUNK = 1280
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
+
+audio = pyaudio.PyAudio()
+stream = audio.open(format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK)
+
+# Speech recognition setup
+recognizer = sr.Recognizer()
+
+
+def get_latest_session_id():
+    """Return the ID shown in brackets on entry "1." of `gemini --list-sessions`.
+
+    Presumably that entry is the most recent session (TODO confirm the
+    listing order). Returns None when the command fails or nothing matches.
+    """
+    try:
+        result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True)
+        # Match UUID inside brackets in the first session line (e.g., [c16895c1-...])
+        match = re.search(r"1\..*?\[(.*?)\]", result.stdout)
+        if match:
+            return match.group(1)
+    except Exception as e:
+        print(f"Error fetching session ID: {e}")
+    return None
+
+
+def speak_text(text):
+    """Speak *text* with gTTS, falling back to the macOS `say` command.
+
+    Markdown markers (*, #, `) are removed first; text that is empty or
+    blank once they are removed is skipped.
+    """
+    if not text:
+        return
+
+    # Remove markdown for cleaner speech
+    clean_text = text.translate(MARKDOWN_CHARS)
+    if not clean_text.strip():
+        # Nothing speakable is left (e.g. the text was only "***")
+        return
+
+    print("[Jarvis] Generating high-quality audio...")
+    try:
+        # Generate speech using gTTS
+        tts = gTTS(text=clean_text, lang='en')
+
+        # Save to a memory-based byte stream instead of a file
+        fp = io.BytesIO()
+        tts.write_to_fp(fp)
+        fp.seek(0)
+
+        # Play using pygame
+        pygame.mixer.music.load(fp)
+        pygame.mixer.music.play()
+
+        # Wait until playback is finished, polling about 10 times a second
+        clock = pygame.time.Clock()
+        while pygame.mixer.music.get_busy():
+            clock.tick(10)
+
+    except Exception as e:
+        print(f"Error in TTS: {e}")
+        # Fallback to 'say' command if gTTS fails (e.g. offline)
+        subprocess.run(["say", clean_text])
+
+
+def run_gemini(command):
+    """Call the gemini CLI with *command*, print its output and speak it.
+
+    After the first successful call the session ID is stored in
+    `current_session_id` so later commands resume the same conversation.
+    """
+    global current_session_id
+
+    # NOTE(review): --yolo is understood to auto-approve the CLI's tool
+    # actions, so transcribed speech is acted on without confirmation.
+    args = ["gemini", "--prompt", command, "--yolo"]
+
+    if current_session_id:
+        args.extend(["--resume", current_session_id])
+        print(f"\n[Jarvis] Continuing session {current_session_id}...")
+    else:
+        print("\n[Jarvis] Starting new conversation session...")
+
+    print(f"[Jarvis] Executing: {' '.join(args)}")
+
+    try:
+        # Capture stdout so it can be both printed and spoken
+        process = subprocess.run(args, capture_output=True, text=True)
+        response = process.stdout.strip()
+
+        if response:
+            print(f"\n[Gemini Response]:\n{response}")
+            speak_text(response)
+
+        if process.returncode != 0:
+            # A failed call must not lock onto an unrelated older session
+            print(f"[Jarvis] gemini exited with status {process.returncode}")
+            if process.stderr.strip():
+                print(process.stderr.strip())
+            return
+
+        # After the first successful call, capture the session ID
+        if not current_session_id:
+            time.sleep(1)
+            current_session_id = get_latest_session_id()
+            if current_session_id:
+                print(f"[Jarvis] Session locked: {current_session_id}")
+
+    except Exception as e:
+        print(f"Error running gemini: {e}")
+
+
+print(f"Listening for '{WAKE_WORD}'...")
+
+try:
+    while True:
+        # 1. Listen for Wake Word
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_frame = np.frombuffer(data, dtype=np.int16)
+        prediction = model.predict(audio_frame)
+
+        if prediction[WAKE_WORD] <= SENSITIVITY:
+            continue
+
+        print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
+
+        # Pause wake-word capture while the command is handled so the input
+        # stream does not keep buffering audio that is fed to the model later.
+        stream.stop_stream()
+        try:
+            play_sound()
+
+            # 2. Capture Command
+            print("[Jarvis] Listening for command...")
+
+            command = None
+            with sr.Microphone() as source:
+                recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                try:
+                    audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
+                    print("[Jarvis] Transcribing...")
+                    command = recognizer.recognize_google(audio_cmd)
+                    print(f"[Jarvis] You said: {command}")
+                except sr.WaitTimeoutError:
+                    print("[Jarvis] No command detected.")
+                except sr.UnknownValueError:
+                    print("[Jarvis] Could not understand audio.")
+                except sr.RequestError as e:
+                    print(f"[Jarvis] Speech service error: {e}")
+
+            # 3. Execute (the microphone is released before Gemini runs)
+            if command is not None:
+                run_gemini(command)
+        finally:
+            # Clear the model's buffers so the wake word that was just
+            # handled cannot trigger again, then resume listening.
+            model.reset()
+            stream.start_stream()
+
+        print(f"\nListening for '{WAKE_WORD}'...")
+
+except KeyboardInterrupt:
+    print("\nStopping...")
+finally:
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+    pygame.mixer.quit()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..737cc00
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+openwakeword
+pyaudio
+requests
+SpeechRecognition
+numpy
+onnxruntime
+gTTS
+pygame