major changes

2025-05-23 16:55:54 -05:00 · 2025-05-23 16:55:54 -05:00 · 2ecf20fcfa
parent 326757f4d2
commit 2ecf20fcfa
12 changed files with 146 additions and 46 deletions
--- a/README.md
+++ b/README.md
@ -18,12 +18,25 @@ source venv/bin/activate
 Then, install the dependencies:
 `pip install -r requirements.txt`
-Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set your API key environment variables. The Python libraries used in this project will automatically detect and use these environment variables for authentication.
+Next, create accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set up your API keys.
 Copy the example environment file and add your API keys:
 ```bash
-export OPENAI_API_KEY=<your-openai-api-key>
+cp env.example .env
-export ELEVENLABS_API_KEY=<your-elevenlabs-api-key>
+```
-export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
+
 Then edit the `.env` file with your actual API keys:
 ```bash
 # Copy this file to .env and replace the placeholder values with your actual API keys
 # OpenAI API Key - Get from https://beta.openai.com/
 OPENAI_API_KEY=your-openai-api-key-here
 # ElevenLabs API Key - Get from https://elevenlabs.io/
 ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
 # ElevenLabs Voice ID (Optional) - If not set, defaults to "21m00Tcm4TlvDq8ikWAM"
 ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here
 ```
 **Note on API Keys and Voice ID:**
@ -31,6 +44,8 @@ export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
 *   `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech.
 *   `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, create one in your ElevenLabs account and set `ELEVEN_VOICE_ID` to its voice ID.
 The application now reads these values from a `.env` file, so you no longer need to export them in your shell. Make sure `.env` is listed in your `.gitignore` so that your API keys stay out of version control.
 ## Run it!
 ```bash
--- a/assets/stop_slouching.mp3
+++ b/assets/stop_slouching.mp3
--- a/assets/stop_slouching.wav
+++ b/assets/stop_slouching.wav
--- a/assets/wonderful_posture.mp3
+++ b/assets/wonderful_posture.mp3
--- a/assets/wonderful_posture.wav
+++ b/assets/wonderful_posture.wav
--- a/capture.log
+++ b/capture.log
--- a/capture.py
+++ b/capture.py
@ -1,50 +1,74 @@
 import cv2
 import time
-from PIL import Image
+from PIL import Image, ImageGrab
 import numpy as np
 import os
 import glob
 from datetime import datetime
 # Folder
 folder = "frames"
 MAX_IMAGES = 10
 # Create the frames folder if it doesn't exist
 frames_dir = os.path.join(os.getcwd(), folder)
 os.makedirs(frames_dir, exist_ok=True)
-# Initialize the webcam
+def cleanup_old_images():
-cap = cv2.VideoCapture(0)
+    """Keep only the 10 most recent frame images"""
    frame_files = glob.glob(os.path.join(frames_dir, "frame_*.jpg"))
    frame_files.sort(key=os.path.getctime, reverse=True)  # Sort by creation time, newest first
    # Remove older files if we have more than MAX_IMAGES
    for old_file in frame_files[MAX_IMAGES:]:
        try:
            os.remove(old_file)
            print(f"Removed old frame: {os.path.basename(old_file)}")
        except OSError as e:
            print(f"Error removing old frame {old_file}: {e}")
-# Check if the webcam is opened correctly
+def get_latest_frame_path():
-if not cap.isOpened():
+    """Get the path of the most recent frame file"""
-    raise IOError("Cannot open webcam")
+    frame_files = glob.glob(os.path.join(frames_dir, "frame_*.jpg"))
    if frame_files:
        return max(frame_files, key=os.path.getctime)
    return None
-# Wait for the camera to initialize and adjust light levels
+print("📸 Starting screenshot capture... Capture is watching your screen!")
 time.sleep(2)
 print("📸 Starting image capture... Say cheese!")
 while True:
-    ret, frame = cap.read()
+    try:
-    if ret:
+        # Take a screenshot
-        # Convert the frame to a PIL image
+        screenshot = ImageGrab.grab()
-        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        
        # Resize the image
        max_size = 250
-        ratio = max_size / max(pil_img.size)
+        ratio = max_size / max(screenshot.size)
-        new_size = tuple([int(x*ratio) for x in pil_img.size])
+        new_size = tuple([int(x*ratio) for x in screenshot.size])
-        resized_img = pil_img.resize(new_size, Image.LANCZOS)
+        resized_img = screenshot.resize(new_size, Image.LANCZOS)
-        # Convert the PIL image back to an OpenCV image
+        # Convert the PIL image to an OpenCV image for saving
        frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
-        # Save the frame as an image file
+        # Generate timestamped filename
-        tmp_path = os.path.join(frames_dir, "frame.tmp.jpg")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        final_path = os.path.join(frames_dir, "frame.jpg")
+        frame_filename = f"frame_{timestamp}.jpg"
        tmp_path = os.path.join(frames_dir, f"frame_{timestamp}.tmp.jpg")
        final_path = os.path.join(frames_dir, frame_filename)
        try:
            cv2.imwrite(tmp_path, frame)
            os.rename(tmp_path, final_path)
            # Also create/update a symlink to the latest frame for backward compatibility
            latest_link = os.path.join(frames_dir, "frame.jpg")
            if os.path.exists(latest_link) or os.path.islink(latest_link):
                os.remove(latest_link)
            os.symlink(frame_filename, latest_link)
            # Clean up old images
            cleanup_old_images()
        except cv2.error as e:
            print(f"OpenCV error: Failed to write image: {e}")
        except OSError as e:
@ -52,12 +76,11 @@ while True:
        except Exception as e:
            print(f"An unexpected error occurred during file operation: {e}")
-    else:
+    except Exception as e:
-        print("Failed to capture image")
+        print(f"Failed to capture screenshot: {e}")
    # Wait for 2 seconds
    time.sleep(2)
-# Release the camera and close all windows
+# Cleanup
 cap.release()
 cv2.destroyAllWindows()
--- a/capture_error.log
+++ b/capture_error.log
@ -0,0 +1,24 @@
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
 shell-init: error retrieving current directory: getcwd: cannot access parent directories: Operation not permitted
 /bin/bash: /Users/roshanvenugopal/Documents/github/narrator/launch_capture.sh: Operation not permitted
--- a/env.example
+++ b/env.example
@ -0,0 +1,11 @@
 # Copy this file to .env and replace the placeholder values with your actual API keys
 # OpenAI API Key - Get from https://beta.openai.com/
 OPENAI_API_KEY=your-openai-api-key-here
 # ElevenLabs API Key - Get from https://elevenlabs.io/
 ELEVENLABS_API_KEY=your-elevenlabs-api-key-here
 # ElevenLabs Voice ID (Optional) - If not set, defaults to "21m00Tcm4TlvDq8ikWAM"
 # You can find available voice IDs at https://elevenlabs.io/docs/api-reference/voices
 ELEVEN_VOICE_ID=your-elevenlabs-voice-id-here 
--- a/launch_capture.sh
+++ b/launch_capture.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Set working directory
 cd /Users/roshanvenugopal/Documents/github/narrator
 # Activate virtual environment and run capture
 source venv/bin/activate
 python capture.py 
--- a/narrator.py
+++ b/narrator.py
@ -1,13 +1,21 @@
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
 import base64
 import json
 import time
 import simpleaudio as sa
 import errno
-from elevenlabs import generate, play, voices
+from elevenlabs.client import ElevenLabs
 from elevenlabs import play
 # Load environment variables from .env file
 load_dotenv()
 client = OpenAI()
 elevenlabs_client = ElevenLabs(
    api_key=os.environ.get("ELEVENLABS_API_KEY")
 )
 def encode_image(image_path, retries=3, delay=0.1):
@ -32,8 +40,12 @@ def encode_image(image_path, retries=3, delay=0.1):
 def play_audio(text):
    try:
        voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
-        # audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")
+        # Generate audio using the new ElevenLabs client API
-        audio = generate(text=text, voice=voice_id, model="eleven_turbo_v2")
+        audio = elevenlabs_client.generate(
            text=text,
            voice=voice_id,
            model="eleven_turbo_v2"
        )
    except Exception as e: # Replace with specific ElevenLabs APIError if available
        print(f"Error generating audio with ElevenLabs: {e}")
        return
@ -62,7 +74,9 @@ def generate_new_line(base64_image):
                {"type": "text", "text": "Describe this image"},
                {
                    "type": "image_url",
-                    "image_url": f"data:image/jpeg;base64,{base64_image}",
+                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ],
        },
@ -91,18 +105,22 @@ def analyze_image(base64_image, script):
    )
        response_text = response.choices[0].message.content
        return response_text
    except client.APIConnectionError as e:
        print(f"OpenAI API Connection Error: {e}")
        return "Error: Could not connect to OpenAI API."
    except client.RateLimitError as e:
        print(f"OpenAI API Rate Limit Error: {e}")
        return "Error: OpenAI API rate limit exceeded."
    except client.APIStatusError as e:
        print(f"OpenAI API Status Error: {e}")
        return f"Error: OpenAI API returned an error status {e.status_code}."
    except Exception as e:
-        print(f"An unexpected error occurred during OpenAI API call: {e}")
+        if "APIConnectionError" in str(type(e)):
-        return "Error: An unexpected error occurred during image analysis."
+            print(f"OpenAI API Connection Error: {e}")
            return "Error: Could not connect to OpenAI API."
        elif "RateLimitError" in str(type(e)):
            print(f"OpenAI API Rate Limit Error: {e}")
            return "Error: OpenAI API rate limit exceeded."
        elif "AuthenticationError" in str(type(e)):
            print(f"OpenAI API Authentication Error: {e}")
            return "Error: Invalid OpenAI API key. Please check your .env file."
        elif "APIStatusError" in str(type(e)):
            print(f"OpenAI API Status Error: {e}")
            return f"Error: OpenAI API returned an error status."
        else:
            print(f"An unexpected error occurred during OpenAI API call: {e}")
            return "Error: An unexpected error occurred during image analysis."
 def main():
--- a/requirements.txt
+++ b/requirements.txt
@ -26,8 +26,9 @@ prompt-toolkit==3.0.51
 ptyprocess==0.7.0
 pure-eval==0.2.3
 pydantic==2.11.5
-pydantic_core==2.34.1
+pydantic_core==2.33.2
 Pygments==2.19.1
 python-dotenv==1.0.0
 requests==2.32.3
 simpleaudio==1.0.4
 six==1.17.0