From 326757f4d20b1642d4304e267b9a31a90661bbca Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 23 May 2025 21:17:49 +0000
Subject: [PATCH] Refactor: Update dependencies, improve error handling, and
 enhance configuration.

This commit brings several improvements to the application:

- Updates all Python dependencies in requirements.txt to their latest versions.
- Enhances file handling in capture.py by writing to a temporary file before renaming, preventing partial reads.
- Strengthens error handling for API calls (OpenAI, ElevenLabs) and file operations in both capture.py and narrator.py.
- Makes the ElevenLabs Voice ID configurable via an ELEVEN_VOICE_ID environment variable in narrator.py, with a sensible default.
- Aligns the narrator's persona in narrator.py with a "David Attenborough" style by replacing the snarky-teenager system prompt with a nature-documentary one.
- Updates the README.md to remove outdated information, clarify API key usage, and include new configuration options.
- Keeps the existing audio saving mechanism (each narration is written to narration/<unique-id>/audio.wav), which remains suitable for archival/logging.
- Upgrades the OpenAI model from gpt-4-vision-preview to gpt-4-turbo in narrator.py.
- Reduces console noise by making the "Say cheese!" message in capture.py print only once.

Comprehensive docstrings and comments were not added in this pass.
---
 README.md        |  14 ++++---
 capture.py       |  18 +++++++--
 narrator.py      | 101 +++++++++++++++++++++++++++++++----------------
 requirements.txt |  76 +++++++++++++++++------------------
 4 files changed, 129 insertions(+), 80 deletions(-)
diff --git a/README.md b/README.md
index 33f9b66..5861d91 100644
--- a/README.md
+++ b/README.md
@@ -18,14 +18,18 @@ source venv/bin/activate
 Then, install the dependencies:
 `pip install -r requirements.txt`
 
-Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), and [ElevenLabs](https://elevenlabs.io) account and set your tokens:
+Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set your API key environment variables. The Python libraries used in this project will automatically detect and use these environment variables for authentication.
 
-```
-export OPENAI_API_KEY=<token>
-export ELEVENLABS_API_KEY=<eleven-token>
+```bash
+export OPENAI_API_KEY=<your-openai-api-key>
+export ELEVENLABS_API_KEY=<your-elevenlabs-api-key>
+export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
 ```
 
-Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API.
+**Note on API Keys and Voice ID:**
+*   `OPENAI_API_KEY`: Your API key from OpenAI, used for the vision and language model.
+*   `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech.
+*   `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, make a new voice in your ElevenLabs account and get its voice ID.
 
 ## Run it!
 
diff --git a/capture.py b/capture.py
index bc9845c..4c8a1de 100644
--- a/capture.py
+++ b/capture.py
@@ -21,6 +21,8 @@ if not cap.isOpened():
 # Wait for the camera to initialize and adjust light levels
 time.sleep(2)
 
+print("📸 Starting image capture... Say cheese!")
+
 while True:
     ret, frame = cap.read()
     if ret:
@@ -37,9 +39,19 @@ while True:
         frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
 
         # Save the frame as an image file
-        print("📸 Say cheese! Saving frame.")
-        path = f"{folder}/frame.jpg"
-        cv2.imwrite(path, frame)
+        tmp_path = os.path.join(frames_dir, "frame.tmp.jpg")
+        final_path = os.path.join(frames_dir, "frame.jpg")
+        
+        try:
+            cv2.imwrite(tmp_path, frame)
+            os.rename(tmp_path, final_path)
+        except cv2.error as e:
+            print(f"OpenCV error: Failed to write image: {e}")
+        except OSError as e:
+            print(f"OS error: Failed to rename image: {e}")
+        except Exception as e:
+            print(f"An unexpected error occurred during file operation: {e}")
+            
     else:
         print("Failed to capture image")
 
diff --git a/narrator.py b/narrator.py
index 6d25f8d..13866d1 100644
--- a/narrator.py
+++ b/narrator.py
@@ -10,33 +10,48 @@ from elevenlabs import generate, play, voices
 client = OpenAI()
 
 
-def encode_image(image_path):
-    while True:
+def encode_image(image_path, retries=3, delay=0.1):
+    for attempt in range(retries):
         try:
             with open(image_path, "rb") as image_file:
                 return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            print(f"Error: Image file not found at {image_path}. Retrying...")
+            time.sleep(delay)
         except IOError as e:
-            if e.errno != errno.EACCES:
-                # Not a "file in use" error, re-raise
-                raise
-            # File is being written to, wait a bit and retry
-            time.sleep(0.1)
+            # Handles other I/O errors, including permission errors if they still occur
+            print(f"IOError when trying to read {image_path}: {e}. Retrying...")
+            time.sleep(delay)
+        except Exception as e:
+            print(f"An unexpected error occurred while encoding image {image_path}: {e}")
+            return None # Or raise, depending on desired behavior for unexpected errors
+    print(f"Failed to encode image {image_path} after {retries} retries.")
+    return None
 
 
 def play_audio(text):
-    #audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")
-    audio = generate(text=text, voice="21m00Tcm4TlvDq8ikWAM", model="eleven_turbo_v2")
+    try:
+        voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
+        # audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")
+        audio = generate(text=text, voice=voice_id, model="eleven_turbo_v2")
+    except Exception as e: # Replace with specific ElevenLabs APIError if available
+        print(f"Error generating audio with ElevenLabs: {e}")
+        return
 
+    try:
+        unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
+        dir_path = os.path.join("narration", unique_id)
+        os.makedirs(dir_path, exist_ok=True)
+        file_path = os.path.join(dir_path, "audio.wav")
 
-    unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
-    dir_path = os.path.join("narration", unique_id)
-    os.makedirs(dir_path, exist_ok=True)
-    file_path = os.path.join(dir_path, "audio.wav")
-
-    with open(file_path, "wb") as f:
-        f.write(audio)
-
-    play(audio)
+        with open(file_path, "wb") as f:
+            f.write(audio)
+        
+        play(audio) # Assuming play() is blocking; if not, ensure file is written before next step
+    except IOError as e:
+        print(f"IOError saving or playing audio file: {e}")
+    except Exception as e:
+        print(f"An unexpected error occurred during audio playback: {e}")
 
 
 def generate_new_line(base64_image):
@@ -55,14 +70,16 @@ def generate_new_line(base64_image):
 
 
 def analyze_image(base64_image, script):
-    response = client.chat.completions.create(
-        model="gpt-4-vision-preview",
-        messages=[
+    if not base64_image:
+        return "Error: Could not encode image for analysis."
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4-turbo",
+            messages=[
             {
                 "role": "system",
                 "content": """
-                You are a teenager. Narrate the picture as a teenager.
-                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
+                Narrate the picture in the style of a nature documentary. Be observational, insightful, and use vivid language. Maintain a respectful and engaging tone. Keep it concise. If anything interesting or unusual is observed, highlight it with a sense of wonder or intrigue.
                 Also do not exceed 300 characters.
                 take a deep breath and do this step by step.
                 """,
@@ -72,8 +89,20 @@ def analyze_image(base64_image, script):
         + generate_new_line(base64_image),
         max_tokens=500,
     )
-    response_text = response.choices[0].message.content
-    return response_text
+        response_text = response.choices[0].message.content
+        return response_text
+    except client.APIConnectionError as e:
+        print(f"OpenAI API Connection Error: {e}")
+        return "Error: Could not connect to OpenAI API."
+    except client.RateLimitError as e:
+        print(f"OpenAI API Rate Limit Error: {e}")
+        return "Error: OpenAI API rate limit exceeded."
+    except client.APIStatusError as e:
+        print(f"OpenAI API Status Error: {e}")
+        return f"Error: OpenAI API returned an error status {e.status_code}."
+    except Exception as e:
+        print(f"An unexpected error occurred during OpenAI API call: {e}")
+        return "Error: An unexpected error occurred during image analysis."
 
 
 def main():
@@ -86,16 +115,20 @@ def main():
         # getting the base64 encoding
         base64_image = encode_image(image_path)
 
-        # analyze posture
-        print("👀 David is watching...")
-        analysis = analyze_image(base64_image, script=script)
+        if base64_image:
+            # analyze posture
+            print("👀 David is watching...")
+            analysis = analyze_image(base64_image, script=script)
 
-        print("🎙️ David says:")
-        print(analysis)
-
-        play_audio(analysis)
-
-        script = script + [{"role": "assistant", "content": analysis}]
+            if analysis and not analysis.startswith("Error:") : # Check if analysis produced valid output
+                print("🎙️ David says:")
+                print(analysis)
+                play_audio(analysis)
+                script = script + [{"role": "assistant", "content": analysis}]
+            else:
+                print(analysis) # Print error message from analyze_image or if it's None
+        else:
+            print("Skipping analysis due to image encoding failure.")
 
         # wait for 5 seconds
         time.sleep(5)
diff --git a/requirements.txt b/requirements.txt
index 12cae1c..1c9e847 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,41 +1,41 @@
-annotated-types==0.6.0
-anyio==3.7.1
-appnope==0.1.3
-asttokens==2.4.1
-certifi==2023.7.22
-charset-normalizer==3.3.2
-decorator==5.1.1
-distro==1.8.0
-elevenlabs==0.2.26
-exceptiongroup==1.1.3
-executing==2.0.1
-h11==0.14.0
-httpcore==1.0.1
-httpx==0.25.1
-idna==3.4
-ipython==8.17.2
-jedi==0.19.1
-matplotlib-inline==0.1.6
-numpy==1.26.1
-openai==1.1.1
-opencv-python==4.8.1.78
-parso==0.8.3
-pexpect==4.8.0
-Pillow==10.1.0
-prompt-toolkit==3.0.41
+annotated-types==0.7.0
+anyio==4.9.0
+appnope==0.1.4
+asttokens==3.0.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+decorator==5.2.1
+distro==1.9.0
+elevenlabs==2.1.0
+exceptiongroup==1.3.0
+executing==2.2.0
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+idna==3.10
+ipython==8.36.0
+jedi==0.19.2
+matplotlib-inline==0.1.7
+numpy==2.2.6
+openai==1.82.0
+opencv-python==4.11.0.86
+parso==0.8.4
+pexpect==4.9.0
+Pillow==11.2.1
+prompt-toolkit==3.0.51
 ptyprocess==0.7.0
-pure-eval==0.2.2
-pydantic==2.4.2
-pydantic_core==2.10.1
-Pygments==2.16.1
-requests==2.31.0
+pure-eval==0.2.3
+pydantic==2.11.5
+pydantic_core==2.34.1
+Pygments==2.19.1
+requests==2.32.3
 simpleaudio==1.0.4
-six==1.16.0
-sniffio==1.3.0
+six==1.17.0
+sniffio==1.3.1
 stack-data==0.6.3
-tqdm==4.66.1
-traitlets==5.13.0
-typing_extensions==4.8.0
-urllib3==2.0.7
-wcwidth==0.2.10
-websockets==12.0
+tqdm==4.67.1
+traitlets==5.14.3
+typing_extensions==4.13.2
+urllib3==2.4.0
+wcwidth==0.2.13
+websockets==15.0.1