major changes

This commit is contained in:
Roshan Venugopal 2025-05-23 16:55:54 -05:00
parent 326757f4d2
commit 2ecf20fcfa
12 changed files with 146 additions and 46 deletions

View File

@ -18,12 +18,25 @@ source venv/bin/activate
Then, install the dependencies:
`pip install -r requirements.txt`
Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set your API key environment variables. The Python libraries used in this project will automatically detect and use these environment variables for authentication.
Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set up your API keys.
Copy the example environment file and add your API keys:
```bash
export OPENAI_API_KEY=<your-openai-api-key>
export ELEVENLABS_API_KEY=<your-elevenlabs-api-key>
export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
cp env.example .env
```
Then edit the `.env` file with your actual API keys:
```bash
# Copy this file to .env and replace with your actual API keys
# OpenAI API Key - Get from https://beta.openai.com/
OPENAI_API_KEY=your-openai-api-key-here
# ElevenLabs API Key - Get from https://elevenlabs.io/
ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
# ElevenLabs Voice ID (Optional) - If not set, defaults to "21m00Tcm4TlvDq8ikWAM"
ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here
```
**Note on API Keys and Voice ID:**
@ -31,6 +44,8 @@ export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
* `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech.
* `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, make a new voice in your ElevenLabs account and get its voice ID.
The application now reads these values from a `.env` file instead of your shell environment. Make sure `.env` is listed in your `.gitignore` so that your API keys stay out of version control.
## Run it!
```bash

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

0
capture.log Normal file
View File

View File

@ -1,50 +1,74 @@
import cv2
import time
from PIL import Image
from PIL import Image, ImageGrab
import numpy as np
import os
import glob
from datetime import datetime
# Folder
folder = "frames"
MAX_IMAGES = 10
# Create the frames folder if it doesn't exist
frames_dir = os.path.join(os.getcwd(), folder)
os.makedirs(frames_dir, exist_ok=True)
# Initialize the webcam
cap = cv2.VideoCapture(0)
def cleanup_old_images(max_images=None, directory=None):
    """Delete all but the newest frame images in a directory.

    Files matching ``frame_*.jpg`` are ranked by ``os.path.getctime``
    (newest first) and everything after the first ``max_images`` entries is
    removed. A failed removal is reported on stdout and does not stop the
    sweep.

    Args:
        max_images: Number of frames to keep. Defaults to the module-level
            ``MAX_IMAGES``.
        directory: Folder to sweep. Defaults to the module-level
            ``frames_dir``.

    Raises:
        ValueError: If ``max_images`` is negative.
    """
    if max_images is None:
        max_images = MAX_IMAGES
    if directory is None:
        directory = frames_dir
    if max_images < 0:
        raise ValueError(f"max_images must be >= 0, got {max_images}")

    # Stat every candidate exactly once. A file can disappear between the
    # glob and the stat; such a file needs no cleanup, so it is skipped
    # instead of letting the OSError abort the whole sweep.
    stamped = []
    for path in glob.glob(os.path.join(directory, "frame_*.jpg")):
        try:
            stamped.append((os.path.getctime(path), path))
        except OSError:
            continue

    # NOTE: getctime is the creation time on Windows only; on Unix it is the
    # time of the last metadata change.
    stamped.sort(key=lambda item: item[0], reverse=True)

    for _, old_file in stamped[max_images:]:
        try:
            os.remove(old_file)
            print(f"Removed old frame: {os.path.basename(old_file)}")
        except OSError as e:
            print(f"Error removing old frame {old_file}: {e}")
# Check if the webcam is opened correctly
if not cap.isOpened():
raise IOError("Cannot open webcam")
def get_latest_frame_path(directory=None):
    """Return the path of the most recent finished frame image, or None.

    Candidates are files matching ``frame_*.jpg``, ranked by
    ``os.path.getctime``. Files ending in ``.tmp.jpg`` are ignored: the
    capture loop writes each frame under that suffix and renames it once the
    write has finished, so such a file may still be incomplete.

    Args:
        directory: Folder to search. Defaults to the module-level
            ``frames_dir``.

    Returns:
        The path of the newest frame, or ``None`` when no frame exists.
    """
    if directory is None:
        directory = frames_dir

    newest_path = None
    newest_ctime = None
    for path in glob.glob(os.path.join(directory, "frame_*.jpg")):
        if path.endswith(".tmp.jpg"):
            continue
        try:
            ctime = os.path.getctime(path)
        except OSError:
            # The file was removed after the glob ran; it cannot be the
            # latest frame any more.
            continue
        if newest_ctime is None or ctime > newest_ctime:
            newest_path, newest_ctime = path, ctime
    return newest_path
# Wait for the camera to initialize and adjust light levels
time.sleep(2)
print("📸 Starting image capture... Say cheese!")
print("📸 Starting screenshot capture... Capture is watching your screen!")
while True:
ret, frame = cap.read()
if ret:
# Convert the frame to a PIL image
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
try:
# Take a screenshot
screenshot = ImageGrab.grab()
# Resize the image
max_size = 250
ratio = max_size / max(pil_img.size)
new_size = tuple([int(x*ratio) for x in pil_img.size])
resized_img = pil_img.resize(new_size, Image.LANCZOS)
ratio = max_size / max(screenshot.size)
new_size = tuple([int(x*ratio) for x in screenshot.size])
resized_img = screenshot.resize(new_size, Image.LANCZOS)
# Convert the PIL image back to an OpenCV image
# Convert the PIL image to an OpenCV image for saving
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
# Save the frame as an image file
tmp_path = os.path.join(frames_dir, "frame.tmp.jpg")
final_path = os.path.join(frames_dir, "frame.jpg")
# Generate timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
frame_filename = f"frame_{timestamp}.jpg"
tmp_path = os.path.join(frames_dir, f"frame_{timestamp}.tmp.jpg")
final_path = os.path.join(frames_dir, frame_filename)
try:
cv2.imwrite(tmp_path, frame)
os.rename(tmp_path, final_path)
# Also create/update a symlink to the latest frame for backward compatibility
latest_link = os.path.join(frames_dir, "frame.jpg")
if os.path.exists(latest_link) or os.path.islink(latest_link):
os.remove(latest_link)
os.symlink(frame_filename, latest_link)
# Clean up old images
cleanup_old_images()
except cv2.error as e:
print(f"OpenCV error: Failed to write image: {e}")
except OSError as e:
@ -52,12 +76,11 @@ while True:
except Exception as e:
print(f"An unexpected error occurred during file operation: {e}")
else:
print("Failed to capture image")
except Exception as e:
print(f"Failed to capture screenshot: {e}")
# Wait for 2 seconds
time.sleep(2)
# Release the camera and close all windows
cap.release()
# Cleanup
cv2.destroyAllWindows()

24
capture_error.log Normal file
View File

@ -0,0 +1,24 @@
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
/bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted

11
env.example Normal file
View File

@ -0,0 +1,11 @@
# Copy this file to .env and replace with your actual API keys
# OpenAI API Key - Get from https://beta.openai.com/
OPENAI_API_KEY=your-openai-api-key-here
# ElevenLabs API Key - Get from https://elevenlabs.io/
ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
# ElevenLabs Voice ID (Optional) - If not set, defaults to "21m00Tcm4TlvDq8ikWAM"
# You can find available voice IDs at https://elevenlabs.io/docs/api-reference/voices
ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here

8
launch_capture.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Launch the screenshot capture loop from the narrator repository.
#
# The repository location defaults to the directory containing this script
# and can be overridden with the NARRATOR_DIR environment variable.
# Each setup step aborts on failure so capture.py is never started from the
# wrong directory or outside the virtual environment.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
NARRATOR_DIR="${NARRATOR_DIR:-$SCRIPT_DIR}"

# Set working directory
cd "$NARRATOR_DIR" || {
    echo "launch_capture.sh: cannot cd to '$NARRATOR_DIR'" >&2
    exit 1
}

# Activate virtual environment and run capture
source venv/bin/activate || {
    echo "launch_capture.sh: cannot activate venv in '$NARRATOR_DIR'" >&2
    exit 1
}

# exec replaces the shell so signals reach the Python process directly
exec python capture.py

View File

@ -1,13 +1,21 @@
import os
from dotenv import load_dotenv
from openai import OpenAI
import base64
import json
import time
import simpleaudio as sa
import errno
from elevenlabs import generate, play, voices
from elevenlabs.client import ElevenLabs
from elevenlabs import play
# Load environment variables from .env file
load_dotenv()
client = OpenAI()
elevenlabs_client = ElevenLabs(
api_key=os.environ.get("ELEVENLABS_API_KEY")
)
def encode_image(image_path, retries=3, delay=0.1):
@ -32,8 +40,12 @@ def encode_image(image_path, retries=3, delay=0.1):
def play_audio(text):
try:
voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
# audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")
audio = generate(text=text, voice=voice_id, model="eleven_turbo_v2")
# Generate audio using the new ElevenLabs client API
audio = elevenlabs_client.generate(
text=text,
voice=voice_id,
model="eleven_turbo_v2"
)
except Exception as e: # Replace with specific ElevenLabs APIError if available
print(f"Error generating audio with ElevenLabs: {e}")
return
@ -62,7 +74,9 @@ def generate_new_line(base64_image):
{"type": "text", "text": "Describe this image"},
{
"type": "image_url",
"image_url": f"data:image/jpeg;base64,{base64_image}",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
},
],
},
@ -91,18 +105,22 @@ def analyze_image(base64_image, script):
)
response_text = response.choices[0].message.content
return response_text
except client.APIConnectionError as e:
print(f"OpenAI API Connection Error: {e}")
return "Error: Could not connect to OpenAI API."
except client.RateLimitError as e:
print(f"OpenAI API Rate Limit Error: {e}")
return "Error: OpenAI API rate limit exceeded."
except client.APIStatusError as e:
print(f"OpenAI API Status Error: {e}")
return f"Error: OpenAI API returned an error status {e.status_code}."
except Exception as e:
print(f"An unexpected error occurred during OpenAI API call: {e}")
return "Error: An unexpected error occurred during image analysis."
if "APIConnectionError" in str(type(e)):
print(f"OpenAI API Connection Error: {e}")
return "Error: Could not connect to OpenAI API."
elif "RateLimitError" in str(type(e)):
print(f"OpenAI API Rate Limit Error: {e}")
return "Error: OpenAI API rate limit exceeded."
elif "AuthenticationError" in str(type(e)):
print(f"OpenAI API Authentication Error: {e}")
return "Error: Invalid OpenAI API key. Please check your .env file."
elif "APIStatusError" in str(type(e)):
print(f"OpenAI API Status Error: {e}")
return f"Error: OpenAI API returned an error status."
else:
print(f"An unexpected error occurred during OpenAI API call: {e}")
return "Error: An unexpected error occurred during image analysis."
def main():

View File

@ -26,8 +26,9 @@ prompt-toolkit==3.0.51
ptyprocess==0.7.0
pure-eval==0.2.3
pydantic==2.11.5
pydantic_core==2.34.1
pydantic_core==2.33.2
Pygments==2.19.1
python-dotenv==1.0.0
requests==2.32.3
simpleaudio==1.0.4
six==1.17.0