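Major changes: capture screenshots instead of webcam frames, load API keys from .env, update ElevenLabs/OpenAI client usage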

This commit is contained in:
Roshan Venugopal 2025-05-23 16:55:54 -05:00
parent 326757f4d2
commit 2ecf20fcfa
12 changed files with 146 additions and 46 deletions

View File

@ -18,12 +18,25 @@ source venv/bin/activate
Then, install the dependencies: Then, install the dependencies:
`pip install -r requirements.txt` `pip install -r requirements.txt`
Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set your API key environment variables. The Python libraries used in this project will automatically detect and use these environment variables for authentication. Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set up your API keys.
Copy the example environment file and add your API keys:
```bash ```bash
export OPENAI_API_KEY=<your-openai-api-key> cp env.example .env
export ELEVENLABS_API_KEY=<your-elevenlabs-api-key> ```
export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
Then edit the `.env` file with your actual API keys:
```bash
# Replace the placeholder values below with your actual API keys
# OpenAI API Key - Get from https://beta.openai.com/
OPENAI_API_KEY=your-openai-api-key-here
# ElevenLabs API Key - Get from https://elevenlabs.io/
ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
# ElevenLabs Voice ID (Optional) - If not set, defaults to "21m00Tcm4TlvDq8ikWAM"
ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here
``` ```
**Note on API Keys and Voice ID:** **Note on API Keys and Voice ID:**
@ -31,6 +44,8 @@ export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
* `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech. * `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech.
* `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, make a new voice in your ElevenLabs account and get its voice ID. * `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, make a new voice in your ElevenLabs account and get its voice ID.
The application now reads these values from a `.env` file. Make sure `.env` is listed in your `.gitignore` so that your sensitive API keys stay out of version control.
## Run it! ## Run it!
```bash ```bash

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

0
capture.log Normal file
View File

View File

@ -1,50 +1,74 @@
import cv2 import cv2
import time import time
from PIL import Image from PIL import Image, ImageGrab
import numpy as np import numpy as np
import os import os
import glob
from datetime import datetime
# Folder # Folder
folder = "frames" folder = "frames"
MAX_IMAGES = 10
# Create the frames folder if it doesn't exist # Create the frames folder if it doesn't exist
frames_dir = os.path.join(os.getcwd(), folder) frames_dir = os.path.join(os.getcwd(), folder)
os.makedirs(frames_dir, exist_ok=True) os.makedirs(frames_dir, exist_ok=True)
# Initialize the webcam def cleanup_old_images():
cap = cv2.VideoCapture(0) """Keep only the 10 most recent frame images"""
frame_files = glob.glob(os.path.join(frames_dir, "frame_*.jpg"))
frame_files.sort(key=os.path.getctime, reverse=True) # Sort by creation time, newest first
# Remove older files if we have more than MAX_IMAGES
for old_file in frame_files[MAX_IMAGES:]:
try:
os.remove(old_file)
print(f"Removed old frame: {os.path.basename(old_file)}")
except OSError as e:
print(f"Error removing old frame {old_file}: {e}")
# Check if the webcam is opened correctly def get_latest_frame_path():
if not cap.isOpened(): """Get the path of the most recent frame file"""
raise IOError("Cannot open webcam") frame_files = glob.glob(os.path.join(frames_dir, "frame_*.jpg"))
if frame_files:
return max(frame_files, key=os.path.getctime)
return None
# Wait for the camera to initialize and adjust light levels print("📸 Starting screenshot capture... Capture is watching your screen!")
time.sleep(2)
print("📸 Starting image capture... Say cheese!")
while True: while True:
ret, frame = cap.read() try:
if ret: # Take a screenshot
# Convert the frame to a PIL image screenshot = ImageGrab.grab()
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# Resize the image # Resize the image
max_size = 250 max_size = 250
ratio = max_size / max(pil_img.size) ratio = max_size / max(screenshot.size)
new_size = tuple([int(x*ratio) for x in pil_img.size]) new_size = tuple([int(x*ratio) for x in screenshot.size])
resized_img = pil_img.resize(new_size, Image.LANCZOS) resized_img = screenshot.resize(new_size, Image.LANCZOS)
# Convert the PIL image back to an OpenCV image # Convert the PIL image to an OpenCV image for saving
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR) frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
# Save the frame as an image file # Generate timestamped filename
tmp_path = os.path.join(frames_dir, "frame.tmp.jpg") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
final_path = os.path.join(frames_dir, "frame.jpg") frame_filename = f"frame_{timestamp}.jpg"
tmp_path = os.path.join(frames_dir, f"frame_{timestamp}.tmp.jpg")
final_path = os.path.join(frames_dir, frame_filename)
try: try:
cv2.imwrite(tmp_path, frame) cv2.imwrite(tmp_path, frame)
os.rename(tmp_path, final_path) os.rename(tmp_path, final_path)
# Also create/update a symlink to the latest frame for backward compatibility
latest_link = os.path.join(frames_dir, "frame.jpg")
if os.path.exists(latest_link) or os.path.islink(latest_link):
os.remove(latest_link)
os.symlink(frame_filename, latest_link)
# Clean up old images
cleanup_old_images()
except cv2.error as e: except cv2.error as e:
print(f"OpenCV error: Failed to write image: {e}") print(f"OpenCV error: Failed to write image: {e}")
except OSError as e: except OSError as e:
@ -52,12 +76,11 @@ while True:
except Exception as e: except Exception as e:
print(f"An unexpected error occurred during file operation: {e}") print(f"An unexpected error occurred during file operation: {e}")
else: except Exception as e:
print("Failed to capture image") print(f"Failed to capture screenshot: {e}")
# Wait for 2 seconds # Wait for 2 seconds
time.sleep(2) time.sleep(2)
# Release the camera and close all windows # Cleanup
cap.release()
cv2.destroyAllWindows() cv2.destroyAllWindows()

24
capture_error.log Normal file
View File

@ -0,0 +1,24 @@
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted

11
env.example Normal file
View File

@ -0,0 +1,11 @@
# Copy this file to .env and replace with your actual API keys
# OpenAI API Key - Get from https://beta.openai.com/
OPENAI_API_KEY=your-openai-api-key-here
# ElevenLabs API Key - Get from https://elevenlabs.io/
ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
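# ElevenLabs Voice ID (Optional) - To use the default voice "21m00Tcm4TlvDq8ikWAM",
# delete or comment out the ELEVEN_VOICE_ID line below; the default only applies when the variable is not set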
# You can find available voice IDs at https://elevenlabs.io/docs/api-reference/voices
ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here

8
launch_capture.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Launch capture.py from the project's virtual environment.
#
# Intended to be run with any (or no) valid current directory, e.g. from a
# scheduler, so every step that later commands depend on is checked and the
# script aborts instead of running capture.py from the wrong place.

# Set working directory to the directory that contains this script, so the
# launcher is not tied to one machine's absolute checkout path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || {
    echo "launch_capture.sh: cannot resolve script directory" >&2
    exit 1
}
cd "$SCRIPT_DIR" || {
    echo "launch_capture.sh: cannot cd to $SCRIPT_DIR" >&2
    exit 1
}

# Activate virtual environment and run capture
# (stop if the venv is missing rather than falling back to the system python)
source venv/bin/activate || {
    echo "launch_capture.sh: cannot activate venv in $SCRIPT_DIR" >&2
    exit 1
}
python capture.py

View File

@ -1,13 +1,21 @@
import os import os
from dotenv import load_dotenv
from openai import OpenAI from openai import OpenAI
import base64 import base64
import json import json
import time import time
import simpleaudio as sa import simpleaudio as sa
import errno import errno
from elevenlabs import generate, play, voices from elevenlabs.client import ElevenLabs
from elevenlabs import play
# Load environment variables from .env file
load_dotenv()
client = OpenAI() client = OpenAI()
elevenlabs_client = ElevenLabs(
api_key=os.environ.get("ELEVENLABS_API_KEY")
)
def encode_image(image_path, retries=3, delay=0.1): def encode_image(image_path, retries=3, delay=0.1):
@ -32,8 +40,12 @@ def encode_image(image_path, retries=3, delay=0.1):
def play_audio(text): def play_audio(text):
try: try:
voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM") voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
# audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2") # Generate audio using the new ElevenLabs client API
audio = generate(text=text, voice=voice_id, model="eleven_turbo_v2") audio = elevenlabs_client.generate(
text=text,
voice=voice_id,
model="eleven_turbo_v2"
)
except Exception as e: # Replace with specific ElevenLabs APIError if available except Exception as e: # Replace with specific ElevenLabs APIError if available
print(f"Error generating audio with ElevenLabs: {e}") print(f"Error generating audio with ElevenLabs: {e}")
return return
@ -62,7 +74,9 @@ def generate_new_line(base64_image):
{"type": "text", "text": "Describe this image"}, {"type": "text", "text": "Describe this image"},
{ {
"type": "image_url", "type": "image_url",
"image_url": f"data:image/jpeg;base64,{base64_image}", "image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
}, },
], ],
}, },
@ -91,18 +105,22 @@ def analyze_image(base64_image, script):
) )
response_text = response.choices[0].message.content response_text = response.choices[0].message.content
return response_text return response_text
except client.APIConnectionError as e:
print(f"OpenAI API Connection Error: {e}")
return "Error: Could not connect to OpenAI API."
except client.RateLimitError as e:
print(f"OpenAI API Rate Limit Error: {e}")
return "Error: OpenAI API rate limit exceeded."
except client.APIStatusError as e:
print(f"OpenAI API Status Error: {e}")
return f"Error: OpenAI API returned an error status {e.status_code}."
except Exception as e: except Exception as e:
print(f"An unexpected error occurred during OpenAI API call: {e}") if "APIConnectionError" in str(type(e)):
return "Error: An unexpected error occurred during image analysis." print(f"OpenAI API Connection Error: {e}")
return "Error: Could not connect to OpenAI API."
elif "RateLimitError" in str(type(e)):
print(f"OpenAI API Rate Limit Error: {e}")
return "Error: OpenAI API rate limit exceeded."
elif "AuthenticationError" in str(type(e)):
print(f"OpenAI API Authentication Error: {e}")
return "Error: Invalid OpenAI API key. Please check your .env file."
elif "APIStatusError" in str(type(e)):
print(f"OpenAI API Status Error: {e}")
return f"Error: OpenAI API returned an error status."
else:
print(f"An unexpected error occurred during OpenAI API call: {e}")
return "Error: An unexpected error occurred during image analysis."
def main(): def main():

View File

@ -26,8 +26,9 @@ prompt-toolkit==3.0.51
ptyprocess==0.7.0 ptyprocess==0.7.0
pure-eval==0.2.3 pure-eval==0.2.3
pydantic==2.11.5 pydantic==2.11.5
pydantic_core==2.34.1 pydantic_core==2.33.2
Pygments==2.19.1 Pygments==2.19.1
python-dotenv==1.0.0
requests==2.32.3 requests==2.32.3
simpleaudio==1.0.4 simpleaudio==1.0.4
six==1.17.0 six==1.17.0