Initial commit: Jarvis AI Assistant with openWakeWord and gTTS
This commit is contained in:
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.DS_Store
|
||||||
|
*.wav
|
||||||
|
*.mp3
|
||||||
43
README.md
Normal file
43
README.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Jarvis Voice Assistant (openWakeWord + Gemini CLI)
|
||||||
|
|
||||||
|
This project uses `openWakeWord` to listen for the "Hey Jarvis" wake word, then uses `SpeechRecognition` to capture a spoken command and sends it to the `gemini` CLI. The CLI's reply is printed and read aloud with gTTS.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. **Install PortAudio** (macOS):
|
||||||
|
```bash
|
||||||
|
brew install portaudio
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Create and Activate Virtual Environment**:
|
||||||
|
```bash
|
||||||
|
python3 -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Install Dependencies**:
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Verify Gemini CLI**:
|
||||||
|
Ensure `gemini` is installed and available in your PATH.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Run the script:
|
||||||
|
```bash
|
||||||
|
source venv/bin/activate
|
||||||
|
python jarvis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Say "**Hey Jarvis**".
|
||||||
|
2. You will hear a "tink" sound.
|
||||||
|
3. Speak your command (e.g., "List the files in this directory" or "Check the weather").
|
||||||
|
4. The script will transcribe your command, run `gemini --prompt "<your command>" --yolo`, and read the response aloud.
|
||||||
|
|
||||||
|
## How it Works
|
||||||
|
|
||||||
|
- **openWakeWord**: Provides local, low-latency wake word detection.
|
||||||
|
- **SpeechRecognition**: Uses Google's Web Speech API for transcription.
|
||||||
|
- **Gemini CLI**: The brain that processes the commands and calls agent tools.
|
||||||
168
jarvis.py
Normal file
168
jarvis.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import openwakeword
|
||||||
|
from openwakeword.model import Model
|
||||||
|
import pyaudio
|
||||||
|
import numpy as np
|
||||||
|
import speech_recognition as sr
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
from gtts import gTTS
|
||||||
|
import pygame
|
||||||
|
import io
|
||||||
|
|
||||||
|
# --- Configuration ----------------------------------------------------------
# Name of the openWakeWord model to load; also the key looked up in each
# prediction result.
WAKE_WORD = "hey_jarvis"
# A prediction score must exceed this value to count as a detection.
SENSITIVITY = 0.5
# Sound file handed to `afplay` as the "listening" cue.
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"

# Bring up the pygame mixer once at startup; speech playback goes through it.
pygame.mixer.init()
|
||||||
|
|
||||||
|
def play_sound():
    """Play the system sound that signals Jarvis is listening.

    Runs ``afplay`` on ``SYSTEM_SOUND`` and waits for it to exit; the exit
    status is not checked.
    """
    cue_command = ["afplay", SYSTEM_SOUND]
    subprocess.run(cue_command)
|
||||||
|
|
||||||
|
# ID of the Gemini conversation in progress; stays None until run_gemini()
# records one.
current_session_id = None

# Wake-word detector: load the configured openWakeWord model on the ONNX
# inference framework.
model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")

# Capture parameters for the wake-word microphone stream.
CHUNK = 1280              # frames requested per stream.read() call
FORMAT = pyaudio.paInt16  # sample format handed to PyAudio
CHANNELS = 1
RATE = 16000              # sample rate in Hz

audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

# Recognizer used to record and transcribe the spoken command.
recognizer = sr.Recognizer()
|
||||||
|
|
||||||
|
def get_latest_session_id():
    """Return the ID of the most recent Gemini session, or ``None``.

    Runs ``gemini --list-sessions`` and scans its stdout for numbered
    entries that end with a bracketed session ID, e.g.
    ``  3. Check the weather (Just now) [c16895c1-...]``.

    Returns:
        The bracketed ID of the last numbered entry, or ``None`` when the
        CLI cannot be started or its output contains no such entry.
    """
    try:
        result = subprocess.run(
            ["gemini", "--list-sessions"], capture_output=True, text=True
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"Error fetching session ID: {e}")
        return None

    # Match whole "<n>. <title> [<id>]" lines and capture the bracket that
    # closes the line, so brackets inside a session title (e.g.
    # "Fix [WIP] parser") are never mistaken for the ID.
    session_ids = re.findall(
        r"^[ \t]*\d+\.[ \t].*\[([^\[\]\s]+)\][ \t\r]*$",
        result.stdout,
        re.MULTILINE,
    )
    # NOTE(review): the Gemini CLI session documentation shows this list
    # ordered oldest-first (the highest number is the newest session), so the
    # most recent session is the LAST entry, not entry 1 - confirm against the
    # installed CLI version.
    return session_ids[-1] if session_ids else None
|
||||||
|
|
||||||
|
def speak_text(text):
    """Speak *text* aloud using Google Text-to-Speech (gTTS).

    Markdown markers (``*``, ``#`` and backticks) are removed first. The
    speech is synthesised into an in-memory buffer, played through the pygame
    mixer, and the call blocks until playback has finished. If synthesis or
    playback raises, the cleaned text is handed to the ``say`` command
    instead.

    Args:
        text: The text to speak. ``None``, empty or whitespace-only text is
            ignored, as is text that has nothing left once the markdown
            markers are removed (e.g. ``"***"``).
    """
    if not text:
        return

    # Remove markdown for cleaner speech.
    clean_text = text.translate(str.maketrans("", "", "*#`"))
    if not clean_text.strip():
        # Nothing speakable remains; skip the TTS request and the fallback.
        return

    print("[Jarvis] Generating high-quality audio...")
    try:
        tts = gTTS(text=clean_text, lang='en')

        # Keep the audio in memory rather than writing a temporary file.
        fp = io.BytesIO()
        tts.write_to_fp(fp)
        fp.seek(0)

        pygame.mixer.music.load(fp)
        pygame.mixer.music.play()

        # Poll about ten times per second until playback ends; one Clock is
        # reused instead of building a new one on every iteration.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)

    except Exception as e:
        print(f"Error in TTS: {e}")
        # Fallback to 'say' command if gTTS fails (e.g. offline).
        subprocess.run(["say", clean_text])
|
||||||
|
|
||||||
|
def run_gemini(command):
    """Send *command* to the gemini CLI, then print and speak its reply.

    The first call starts a new conversation. Once a call exits successfully,
    the session ID is looked up with ``get_latest_session_id()`` and stored in
    the module-level ``current_session_id`` so later calls add ``--resume``.
    Any exception raised along the way is printed rather than propagated.

    Args:
        command: The transcribed request, passed as the ``--prompt`` value.
    """
    global current_session_id

    # NOTE(review): --yolo is passed on every call; presumably it auto-approves
    # the CLI's tool actions - confirm that is acceptable for prompts that
    # come straight from speech transcription.
    args = ["gemini", "--prompt", command, "--yolo"]

    if current_session_id:
        args.extend(["--resume", current_session_id])
        print(f"\n[Jarvis] Continuing session {current_session_id}...")
    else:
        print("\n[Jarvis] Starting new conversation session...")

    print(f"[Jarvis] Executing: {' '.join(args)}")

    try:
        # Capture stdout so the reply can be both printed and spoken.
        process = subprocess.run(args, capture_output=True, text=True)
        response = process.stdout.strip()

        if response:
            print(f"\n[Gemini Response]:\n{response}")
            speak_text(response)

        if process.returncode != 0:
            # Report the failure instead of staying silent, and do not lock
            # onto a session this call may never have created.
            details = process.stderr.strip()
            suffix = f": {details}" if details else ""
            print(f"[Jarvis] gemini exited with status {process.returncode}{suffix}")
            return

        # After the first successful call, capture the session ID.
        if not current_session_id:
            # Presumably gives the CLI a moment to record the new session
            # before it is listed - TODO confirm.
            time.sleep(1)
            current_session_id = get_latest_session_id()
            if current_session_id:
                print(f"[Jarvis] Session locked: {current_session_id}")

    except Exception as e:
        print(f"Error running gemini: {e}")
|
||||||
|
|
||||||
|
print(f"Listening for '{WAKE_WORD}'...")

# Main loop: wait for the wake word, record one spoken command, hand it to
# Gemini, then go back to waiting. Ctrl+C stops the assistant; the audio
# stream and PyAudio are always released on the way out.
try:
    while True:
        # 1. Listen for the wake word: score one chunk of microphone audio.
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_frame = np.frombuffer(data, dtype=np.int16)
        prediction = model.predict(audio_frame)

        if prediction[WAKE_WORD] > SENSITIVITY:
            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
            play_sound()

            # 2. Capture the command.
            print("[Jarvis] Listening for command...")

            command = None
            with sr.Microphone() as source:
                recognizer.adjust_for_ambient_noise(source, duration=0.5)
                try:
                    audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
                    print("[Jarvis] Transcribing...")
                    command = recognizer.recognize_google(audio_cmd)
                    print(f"[Jarvis] You said: {command}")
                except sr.WaitTimeoutError:
                    print("[Jarvis] No command detected.")
                except sr.UnknownValueError:
                    print("[Jarvis] Could not understand audio.")
                except sr.RequestError as e:
                    print(f"[Jarvis] Speech service error: {e}")

            # 3. Execute. This runs after the `with` block so the command
            # microphone is released during the Gemini call and playback.
            if command is not None:
                run_gemini(command)

            # Clear the model's buffered audio features and predictions so
            # the wake word that just fired cannot trigger again on the next
            # chunks.
            model.reset()

            print(f"\nListening for '{WAKE_WORD}'...")

except KeyboardInterrupt:
    print("\nStopping...")
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()
|
||||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
openwakeword
|
||||||
|
pyaudio
|
||||||
|
requests
|
||||||
|
SpeechRecognition
|
||||||
|
numpy
|
||||||
|
onnxruntime
gTTS
pygame
|
||||||
Reference in New Issue
Block a user