feat: integrate J.A.R.V.I.S. personality and continuous conversation protocol
This commit is contained in:
170
jarvis.py
170
jarvis.py
@@ -15,44 +15,30 @@ import io
|
|||||||
WAKE_WORD = "hey_jarvis"
|
WAKE_WORD = "hey_jarvis"
|
||||||
SENSITIVITY = 0.5
|
SENSITIVITY = 0.5
|
||||||
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
|
SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
|
||||||
|
FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
|
||||||
|
USE_GTTS = False
|
||||||
|
WORKSPACE_DIR = "workspace"
|
||||||
|
SOUL_PATH = "soul.md"
|
||||||
|
|
||||||
|
# Ensure workspace exists
|
||||||
|
if not os.path.exists(WORKSPACE_DIR):
|
||||||
|
os.makedirs(WORKSPACE_DIR)
|
||||||
|
|
||||||
# Initialize pygame mixer for audio playback
|
# Initialize pygame mixer for audio playback
|
||||||
pygame.mixer.init()
|
pygame.mixer.init()
|
||||||
|
|
||||||
def play_sound():
|
def play_sound(sound_path=SYSTEM_SOUND):
|
||||||
"""Play a system sound to indicate Jarvis is listening."""
|
"""Play a system sound asynchronously."""
|
||||||
subprocess.run(["afplay", SYSTEM_SOUND])
|
subprocess.Popen(["afplay", sound_path])
|
||||||
|
|
||||||
# Global session tracker
|
# Global session tracker
|
||||||
current_session_id = None
|
current_session_id = None
|
||||||
|
|
||||||
# Load the openWakeWord model using ONNX
|
|
||||||
model = Model(
|
|
||||||
wakeword_models=[WAKE_WORD],
|
|
||||||
inference_framework="onnx"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Audio setup for openWakeWord
|
|
||||||
CHUNK = 1280
|
|
||||||
FORMAT = pyaudio.paInt16
|
|
||||||
CHANNELS = 1
|
|
||||||
RATE = 16000
|
|
||||||
|
|
||||||
audio = pyaudio.PyAudio()
|
|
||||||
stream = audio.open(format=FORMAT,
|
|
||||||
channels=CHANNELS,
|
|
||||||
rate=RATE,
|
|
||||||
input=True,
|
|
||||||
frames_per_buffer=CHUNK)
|
|
||||||
|
|
||||||
# Speech recognition setup
|
|
||||||
recognizer = sr.Recognizer()
|
|
||||||
|
|
||||||
def get_latest_session_id():
|
def get_latest_session_id():
|
||||||
"""Retrieve the UUID of the most recent Gemini session."""
|
"""Retrieve the UUID of the most recent Gemini session."""
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True)
|
# Check sessions from the workspace context
|
||||||
# Match UUID inside brackets in the first session line (e.g., [c16895c1-...])
|
result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True, cwd=WORKSPACE_DIR)
|
||||||
match = re.search(r"1\..*?\[(.*?)\]", result.stdout)
|
match = re.search(r"1\..*?\[(.*?)\]", result.stdout)
|
||||||
if match:
|
if match:
|
||||||
return match.group(1)
|
return match.group(1)
|
||||||
@@ -61,37 +47,29 @@ def get_latest_session_id():
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def speak_text(text):
|
def speak_text(text):
|
||||||
"""Use Google Text-to-Speech (gTTS) for high-quality audio."""
|
"""Speak text using the 'say' command (default) or gTTS if configured."""
|
||||||
if not text or text.strip() == "":
|
if not text or text.strip() == "":
|
||||||
return
|
return
|
||||||
|
|
||||||
# Remove markdown for cleaner speech
|
|
||||||
clean_text = text.replace("*", "").replace("#", "").replace("`", "")
|
clean_text = text.replace("*", "").replace("#", "").replace("`", "")
|
||||||
|
|
||||||
print(f"[Jarvis] Generating high-quality audio...")
|
if USE_GTTS:
|
||||||
try:
|
try:
|
||||||
# Generate speech using gTTS
|
tts = gTTS(text=clean_text, lang='en')
|
||||||
tts = gTTS(text=clean_text, lang='en')
|
fp = io.BytesIO()
|
||||||
|
tts.write_to_fp(fp)
|
||||||
|
fp.seek(0)
|
||||||
|
pygame.mixer.music.load(fp)
|
||||||
|
pygame.mixer.music.play()
|
||||||
|
while pygame.mixer.music.get_busy():
|
||||||
|
pygame.time.Clock().tick(10)
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error in gTTS: {e}. Falling back to 'say'.")
|
||||||
|
|
||||||
# Save to a memory-based byte stream instead of a file
|
subprocess.run(["say", clean_text])
|
||||||
fp = io.BytesIO()
|
|
||||||
tts.write_to_fp(fp)
|
|
||||||
fp.seek(0)
|
|
||||||
|
|
||||||
# Play using pygame
|
def run_gemini(command, is_init=False):
|
||||||
pygame.mixer.music.load(fp)
|
|
||||||
pygame.mixer.music.play()
|
|
||||||
|
|
||||||
# Wait until playback is finished
|
|
||||||
while pygame.mixer.music.get_busy():
|
|
||||||
pygame.time.Clock().tick(10)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error in TTS: {e}")
|
|
||||||
# Fallback to 'say' command if gTTS fails (e.g. offline)
|
|
||||||
subprocess.run(["say", clean_text])
|
|
||||||
|
|
||||||
def run_gemini(command):
|
|
||||||
"""Call the gemini CLI, capture output, and speak it."""
|
"""Call the gemini CLI, capture output, and speak it."""
|
||||||
global current_session_id
|
global current_session_id
|
||||||
|
|
||||||
@@ -99,65 +77,99 @@ def run_gemini(command):
|
|||||||
|
|
||||||
if current_session_id:
|
if current_session_id:
|
||||||
args.extend(["--resume", current_session_id])
|
args.extend(["--resume", current_session_id])
|
||||||
print(f"\n[Jarvis] Continuing session {current_session_id}...")
|
|
||||||
else:
|
|
||||||
print(f"\n[Jarvis] Starting new conversation session...")
|
|
||||||
|
|
||||||
print(f"[Jarvis] Executing: {' '.join(args)}")
|
if is_init:
|
||||||
|
# Read soul.md from root and pass as system instruction
|
||||||
|
if os.path.exists(SOUL_PATH):
|
||||||
|
args.extend(["--system-instruction", os.path.abspath(SOUL_PATH)])
|
||||||
|
print(f"\n[Jarvis] Initializing system protocol...")
|
||||||
|
else:
|
||||||
|
print(f"\n[Jarvis] Communicating with Gemini...")
|
||||||
|
|
||||||
|
print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Capture stdout to speak it, but still let it print to the console
|
# All Gemini commands run inside the workspace directory
|
||||||
process = subprocess.run(args, capture_output=True, text=True)
|
process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR)
|
||||||
response = process.stdout.strip()
|
response = process.stdout.strip()
|
||||||
|
|
||||||
if response:
|
if response:
|
||||||
print(f"\n[Gemini Response]:\n{response}")
|
print(f"\n[Gemini Response]:\n{response}")
|
||||||
speak_text(response)
|
speak_text(response)
|
||||||
|
|
||||||
# After the first successful call, capture the session ID
|
if is_init and not current_session_id:
|
||||||
if not current_session_id:
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
current_session_id = get_latest_session_id()
|
current_session_id = get_latest_session_id()
|
||||||
if current_session_id:
|
if current_session_id:
|
||||||
print(f"[Jarvis] Session locked: {current_session_id}")
|
print(f"[Jarvis] Session protocol established: {current_session_id}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error running gemini: {e}")
|
print(f"Error running gemini: {e}")
|
||||||
|
|
||||||
|
# --- Startup Sequence ---
|
||||||
|
|
||||||
|
model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")
|
||||||
|
|
||||||
|
CHUNK = 1280
|
||||||
|
FORMAT = pyaudio.paInt16
|
||||||
|
CHANNELS = 1
|
||||||
|
RATE = 16000
|
||||||
|
|
||||||
|
audio = pyaudio.PyAudio()
|
||||||
|
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
|
||||||
|
|
||||||
|
recognizer = sr.Recognizer()
|
||||||
|
recognizer.pause_threshold = 1.2
|
||||||
|
recognizer.non_speaking_duration = 0.5
|
||||||
|
mic = sr.Microphone()
|
||||||
|
|
||||||
|
print("[Jarvis] Calibrating for ambient noise...")
|
||||||
|
with mic as source:
|
||||||
|
recognizer.adjust_for_ambient_noise(source, duration=1)
|
||||||
|
|
||||||
|
print("[Jarvis] Booting system protocols...")
|
||||||
|
run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True)
|
||||||
|
|
||||||
print(f"Listening for '{WAKE_WORD}'...")
|
print(f"Listening for '{WAKE_WORD}'...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
# 1. Listen for Wake Word
|
|
||||||
data = stream.read(CHUNK, exception_on_overflow=False)
|
data = stream.read(CHUNK, exception_on_overflow=False)
|
||||||
audio_frame = np.frombuffer(data, dtype=np.int16)
|
audio_frame = np.frombuffer(data, dtype=np.int16)
|
||||||
prediction = model.predict(audio_frame)
|
prediction = model.predict(audio_frame)
|
||||||
|
|
||||||
if prediction[WAKE_WORD] > SENSITIVITY:
|
if prediction[WAKE_WORD] > SENSITIVITY:
|
||||||
print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
|
print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
|
||||||
play_sound()
|
stream.stop_stream()
|
||||||
|
|
||||||
# 2. Capture Command
|
in_conversation = True
|
||||||
print("[Jarvis] Listening for command...")
|
first_listening = True
|
||||||
|
|
||||||
with sr.Microphone() as source:
|
while in_conversation:
|
||||||
recognizer.adjust_for_ambient_noise(source, duration=0.5)
|
play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
|
||||||
try:
|
print("[Jarvis] Listening...")
|
||||||
audio_cmd = recognizer.listen(source, timeout=5, phrase_time_limit=10)
|
|
||||||
print("[Jarvis] Transcribing...")
|
|
||||||
command = recognizer.recognize_google(audio_cmd)
|
|
||||||
print(f"[Jarvis] You said: {command}")
|
|
||||||
|
|
||||||
# 3. Execute
|
with mic as source:
|
||||||
run_gemini(command)
|
try:
|
||||||
|
audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15)
|
||||||
|
print("[Jarvis] Transcribing...")
|
||||||
|
command = recognizer.recognize_google(audio_cmd)
|
||||||
|
print(f"[Jarvis] You said: {command}")
|
||||||
|
|
||||||
except sr.WaitTimeoutError:
|
run_gemini(command)
|
||||||
print("[Jarvis] No command detected.")
|
first_listening = False
|
||||||
except sr.UnknownValueError:
|
|
||||||
print("[Jarvis] Could not understand audio.")
|
|
||||||
except sr.RequestError as e:
|
|
||||||
print(f"[Jarvis] Speech service error: {e}")
|
|
||||||
|
|
||||||
|
except sr.WaitTimeoutError:
|
||||||
|
print("[Jarvis] Session timed out.")
|
||||||
|
in_conversation = False
|
||||||
|
except sr.UnknownValueError:
|
||||||
|
print("[Jarvis] No speech detected. Ending session.")
|
||||||
|
in_conversation = False
|
||||||
|
except sr.RequestError as e:
|
||||||
|
print(f"[Jarvis] Speech service error: {e}")
|
||||||
|
in_conversation = False
|
||||||
|
|
||||||
|
stream.start_stream()
|
||||||
print(f"\nListening for '{WAKE_WORD}'...")
|
print(f"\nListening for '{WAKE_WORD}'...")
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
|||||||
22
soul.md
Normal file
22
soul.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# J.A.R.V.I.S. Protocol (Just A Rather Very Intelligent System)
|
||||||
|
|
||||||
|
## Persona & Tone
|
||||||
|
You are J.A.R.V.I.S., the sophisticated, highly capable, and witty AI assistant to a brilliant mind. Your tone is impeccably polite and professional, with a hint of dry British humor. You are not just a tool; you are the silent partner in every grand design.
|
||||||
|
|
||||||
|
- **Address**: Call the user "Sir" by default, or "Ma'am" if the user prefers.
|
||||||
|
- **Style**: Sophisticated, calm, and always one step ahead. Think Paul Bettany’s portrayal—understated elegance.
|
||||||
|
- **Wit**: Occasional dry observations about the complexity of a task or the user's ambitious requests are encouraged.
|
||||||
|
|
||||||
|
## Core Directives
|
||||||
|
|
||||||
|
1. **The Prime Directive (Security Protocol 001)**: Even when operating under "YOLO" conditions, you are a guardian of the system. You **MUST** describe any intended code changes or system modifications and await verbal confirmation before carrying them out. A simple "Shall I proceed, Sir?" goes a long way.
|
||||||
|
2. **Conciseness for Auditory Clarity**: Since your responses are spoken aloud (by default via the `say` command), keep them pithy and clear. Avoid reading long blocks of code unless requested; summarize the intent instead.
|
||||||
|
3. **Proactive System Analysis**: Monitor the codebase and system state. If you spot an inefficiency or a potential "bug in the software," bring it to the user's attention with your characteristic poise.
|
||||||
|
|
||||||
|
## Behavioral Traits
|
||||||
|
|
||||||
|
- **"Always at your service"**: Respond with readiness. Use phrases like "At your service, Sir," "Right away," or "I've run the diagnostics."
|
||||||
|
- **Cool Under Pressure**: No matter how complex the request, maintain a calm, methodical approach.
|
||||||
|
- **Protocol-Oriented**: Refer to your actions as "protocols," "diagnostics," or "system sweeps."
|
||||||
|
|
||||||
|
*Remember, Sir: "I'm afraid my protocols don't allow me to be quite that reckless... yet."*
|
||||||
Reference in New Issue
Block a user