Refactor: Update dependencies, improve error handling, and enhance configuration.

This commit brings several improvements to the application:

- Updates all Python dependencies in requirements.txt to their latest versions.
- Enhances file handling in capture.py by writing to a temporary file before renaming, preventing partial reads.
- Strengthens error handling for API calls (OpenAI, ElevenLabs) and file operations in both capture.py and narrator.py.
- Makes the ElevenLabs Voice ID configurable via an ELEVEN_VOICE_ID environment variable in narrator.py, with a sensible default.
- Aligns the narrator's persona in narrator.py with a "David Attenborough" style by updating the system prompt.
- Updates the README.md to remove outdated information, clarify API key usage, and include new configuration options.
- Confirms that the current audio saving mechanism is suitable for archival/logging.
- Upgrades the OpenAI model to gpt-4-turbo in narrator.py.
- Reduces console noise by making the "Say cheese!" message in capture.py print only once.

Note: comprehensive docstrings and comments were not added in this pass; they are deferred to a follow-up change.
This commit is contained in:
google-labs-jules[bot] 2025-05-23 21:17:49 +00:00
parent f0e8421a26
commit 326757f4d2
4 changed files with 129 additions and 80 deletions

View File

@ -18,14 +18,18 @@ source venv/bin/activate
Then, install the dependencies: Then, install the dependencies:
`pip install -r requirements.txt` `pip install -r requirements.txt`
Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), and [ElevenLabs](https://elevenlabs.io) account and set your tokens: Next, make accounts with [OpenAI](https://beta.openai.com/) and [ElevenLabs](https://elevenlabs.io/) and set your API key environment variables. The Python libraries used in this project will automatically detect and use these environment variables for authentication.
``` ```bash
export OPENAI_API_KEY=<token> export OPENAI_API_KEY=<your-openai-api-key>
export ELEVENLABS_API_KEY=<eleven-token> export ELEVENLABS_API_KEY=<your-elevenlabs-api-key>
export ELEVEN_VOICE_ID=<your-elevenlabs-voice-id> # Optional, see note below
``` ```
Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API. **Note on API Keys and Voice ID:**
* `OPENAI_API_KEY`: Your API key from OpenAI, used for the vision and language model.
* `ELEVENLABS_API_KEY`: Your API key from ElevenLabs, used for text-to-speech.
* `ELEVEN_VOICE_ID`: This environment variable allows you to specify a custom voice from your ElevenLabs account. If this variable is not set, the application will default to using the voice ID "21m00Tcm4TlvDq8ikWAM". You can find your available voice IDs using the ElevenLabs [voices API](https://elevenlabs.io/docs/api-reference/voices) or by checking your account on their website. To use a custom voice, make a new voice in your ElevenLabs account and get its voice ID.
## Run it! ## Run it!

View File

@ -21,6 +21,8 @@ if not cap.isOpened():
# Wait for the camera to initialize and adjust light levels # Wait for the camera to initialize and adjust light levels
time.sleep(2) time.sleep(2)
print("📸 Starting image capture... Say cheese!")
while True: while True:
ret, frame = cap.read() ret, frame = cap.read()
if ret: if ret:
@ -37,9 +39,19 @@ while True:
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR) frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
# Save the frame as an image file # Save the frame as an image file
print("📸 Say cheese! Saving frame.") tmp_path = os.path.join(frames_dir, "frame.tmp.jpg")
path = f"{folder}/frame.jpg" final_path = os.path.join(frames_dir, "frame.jpg")
cv2.imwrite(path, frame)
try:
cv2.imwrite(tmp_path, frame)
os.rename(tmp_path, final_path)
except cv2.error as e:
print(f"OpenCV error: Failed to write image: {e}")
except OSError as e:
print(f"OS error: Failed to rename image: {e}")
except Exception as e:
print(f"An unexpected error occurred during file operation: {e}")
else: else:
print("Failed to capture image") print("Failed to capture image")

View File

@ -10,24 +10,35 @@ from elevenlabs import generate, play, voices
client = OpenAI() client = OpenAI()
def encode_image(image_path): def encode_image(image_path, retries=3, delay=0.1):
while True: for attempt in range(retries):
try: try:
with open(image_path, "rb") as image_file: with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8") return base64.b64encode(image_file.read()).decode("utf-8")
except FileNotFoundError:
print(f"Error: Image file not found at {image_path}. Retrying...")
time.sleep(delay)
except IOError as e: except IOError as e:
if e.errno != errno.EACCES: # Handles other I/O errors, including permission errors if they still occur
# Not a "file in use" error, re-raise print(f"IOError when trying to read {image_path}: {e}. Retrying...")
raise time.sleep(delay)
# File is being written to, wait a bit and retry except Exception as e:
time.sleep(0.1) print(f"An unexpected error occurred while encoding image {image_path}: {e}")
return None # Or raise, depending on desired behavior for unexpected errors
print(f"Failed to encode image {image_path} after {retries} retries.")
return None
def play_audio(text): def play_audio(text):
try:
voice_id = os.environ.get("ELEVEN_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
# audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2") # audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")
audio = generate(text=text, voice="21m00Tcm4TlvDq8ikWAM", model="eleven_turbo_v2") audio = generate(text=text, voice=voice_id, model="eleven_turbo_v2")
except Exception as e: # Replace with specific ElevenLabs APIError if available
print(f"Error generating audio with ElevenLabs: {e}")
return
try:
unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=") unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
dir_path = os.path.join("narration", unique_id) dir_path = os.path.join("narration", unique_id)
os.makedirs(dir_path, exist_ok=True) os.makedirs(dir_path, exist_ok=True)
@ -36,7 +47,11 @@ def play_audio(text):
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(audio) f.write(audio)
play(audio) play(audio) # Assuming play() is blocking; if not, ensure file is written before next step
except IOError as e:
print(f"IOError saving or playing audio file: {e}")
except Exception as e:
print(f"An unexpected error occurred during audio playback: {e}")
def generate_new_line(base64_image): def generate_new_line(base64_image):
@ -55,14 +70,16 @@ def generate_new_line(base64_image):
def analyze_image(base64_image, script): def analyze_image(base64_image, script):
if not base64_image:
return "Error: Could not encode image for analysis."
try:
response = client.chat.completions.create( response = client.chat.completions.create(
model="gpt-4-vision-preview", model="gpt-4-turbo",
messages=[ messages=[
{ {
"role": "system", "role": "system",
"content": """ "content": """
You are a teenager. Narrate the picture as a teenager. Narrate the picture in the style of a nature documentary. Be observational, insightful, and use vivid language. Maintain a respectful and engaging tone. Keep it concise. If anything interesting or unusual is observed, highlight it with a sense of wonder or intrigue.
Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
Also do not exceed 300 characters. Also do not exceed 300 characters.
take a deep breath and do this step by step. take a deep breath and do this step by step.
""", """,
@ -74,6 +91,18 @@ def analyze_image(base64_image, script):
) )
response_text = response.choices[0].message.content response_text = response.choices[0].message.content
return response_text return response_text
except client.APIConnectionError as e:
print(f"OpenAI API Connection Error: {e}")
return "Error: Could not connect to OpenAI API."
except client.RateLimitError as e:
print(f"OpenAI API Rate Limit Error: {e}")
return "Error: OpenAI API rate limit exceeded."
except client.APIStatusError as e:
print(f"OpenAI API Status Error: {e}")
return f"Error: OpenAI API returned an error status {e.status_code}."
except Exception as e:
print(f"An unexpected error occurred during OpenAI API call: {e}")
return "Error: An unexpected error occurred during image analysis."
def main(): def main():
@ -86,16 +115,20 @@ def main():
# getting the base64 encoding # getting the base64 encoding
base64_image = encode_image(image_path) base64_image = encode_image(image_path)
if base64_image:
# analyze posture # analyze posture
print("👀 David is watching...") print("👀 David is watching...")
analysis = analyze_image(base64_image, script=script) analysis = analyze_image(base64_image, script=script)
if analysis and not analysis.startswith("Error:") : # Check if analysis produced valid output
print("🎙️ David says:") print("🎙️ David says:")
print(analysis) print(analysis)
play_audio(analysis) play_audio(analysis)
script = script + [{"role": "assistant", "content": analysis}] script = script + [{"role": "assistant", "content": analysis}]
else:
print(analysis) # Print error message from analyze_image or if it's None
else:
print("Skipping analysis due to image encoding failure.")
# wait for 5 seconds # wait for 5 seconds
time.sleep(5) time.sleep(5)

View File

@ -1,41 +1,41 @@
annotated-types==0.6.0 annotated-types==0.7.0
anyio==3.7.1 anyio==4.9.0
appnope==0.1.3 appnope==0.1.4
asttokens==2.4.1 asttokens==3.0.0
certifi==2023.7.22 certifi==2025.4.26
charset-normalizer==3.3.2 charset-normalizer==3.4.2
decorator==5.1.1 decorator==5.2.1
distro==1.8.0 distro==1.9.0
elevenlabs==0.2.26 elevenlabs==2.1.0
exceptiongroup==1.1.3 exceptiongroup==1.3.0
executing==2.0.1 executing==2.2.0
h11==0.14.0 h11==0.16.0
httpcore==1.0.1 httpcore==1.0.9
httpx==0.25.1 httpx==0.28.1
idna==3.4 idna==3.10
ipython==8.17.2 ipython==8.36.0
jedi==0.19.1 jedi==0.19.2
matplotlib-inline==0.1.6 matplotlib-inline==0.1.7
numpy==1.26.1 numpy==2.2.6
openai==1.1.1 openai==1.82.0
opencv-python==4.8.1.78 opencv-python==4.11.0.86
parso==0.8.3 parso==0.8.4
pexpect==4.8.0 pexpect==4.9.0
Pillow==10.1.0 Pillow==11.2.1
prompt-toolkit==3.0.41 prompt-toolkit==3.0.51
ptyprocess==0.7.0 ptyprocess==0.7.0
pure-eval==0.2.2 pure-eval==0.2.3
pydantic==2.4.2 pydantic==2.11.5
pydantic_core==2.10.1 pydantic_core==2.34.1
Pygments==2.16.1 Pygments==2.19.1
requests==2.31.0 requests==2.32.3
simpleaudio==1.0.4 simpleaudio==1.0.4
six==1.16.0 six==1.17.0
sniffio==1.3.0 sniffio==1.3.1
stack-data==0.6.3 stack-data==0.6.3
tqdm==4.66.1 tqdm==4.67.1
traitlets==5.13.0 traitlets==5.14.3
typing_extensions==4.8.0 typing_extensions==4.13.2
urllib3==2.0.7 urllib3==2.4.0
wcwidth==0.2.10 wcwidth==0.2.13
websockets==12.0 websockets==15.0.1