feat: Update narrator.py and capture.py to reflect ElevenLabs API updates

This commit addresses changes made to update the ElevenLabs API version as per pull request #51 (Update narrator.py to reflect API updates #51 - https://github.com/cbh123/narrator/pull/51). The following changes have been made:

narrator.py:
- Updated the ElevenLabs client instantiation to the new API format.
- Removed the deprecated `set_api_key` and `get_api_key` methods and replaced them with the `ElevenLabs` class instantiation.
- Modified the `play_audio` function to handle the audio generator properly by collecting the audio data into a bytes-like object before writing it to a file and playing it.
- Added detailed docstrings and comments for better understanding and maintenance of the code.
- Ensured that the OpenAI client uses the correct API key and updated the image analysis to handle responses accurately.

capture.py:
- Ensured the frames folder is created if it doesn't exist.
- Updated the webcam initialization check and added a wait time for the camera to adjust light levels.
- Adjusted the image resizing logic to improve performance before saving the frame.
- Added detailed print statements and comments for clarity and debugging purposes.

These changes ensure compatibility with the latest ElevenLabs API and improve the overall robustness and readability of the code.
This commit is contained in:
Matthew Gennings 2024-07-22 16:38:04 -05:00
parent b80925fb4d
commit b3e600377c
4 changed files with 89 additions and 42 deletions

3
.gitignore vendored
View File

@ -3,3 +3,6 @@
/narration /narration
/frames/* /frames/*
!/frames/.gitkeep !/frames/.gitkeep
# DS_STORE
.DS_Store

View File

@ -24,21 +24,21 @@ time.sleep(2)
while True: while True:
ret, frame = cap.read() ret, frame = cap.read()
if ret: if ret:
# Convert the frame to a PIL image # Resize the image before saving to improve performance
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) max_size = 400
height, width = frame.shape[:2]
if height > width:
new_height = max_size
new_width = int((max_size / height) * width)
else:
new_width = max_size
new_height = int((max_size / width) * height)
# Resize the image frame = cv2.resize(frame, (new_width, new_height))
max_size = 250
ratio = max_size / max(pil_img.size)
new_size = tuple([int(x*ratio) for x in pil_img.size])
resized_img = pil_img.resize(new_size, Image.LANCZOS)
# Convert the PIL image back to an OpenCV image
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
# Save the frame as an image file # Save the frame as an image file
print("📸 Say cheese! Saving frame.") print("📸 Say cheese! Saving frame.")
path = f"{folder}/frame.jpg" path = os.path.join(frames_dir, "frame.jpg")
cv2.imwrite(path, frame) cv2.imwrite(path, frame)
else: else:
print("Failed to capture image") print("Failed to capture image")

View File

@ -2,19 +2,32 @@ import os
from dotenv import load_dotenv from dotenv import load_dotenv
from openai import OpenAI from openai import OpenAI
import base64 import base64
import json # import json
import time import time
import simpleaudio as sa # import simpleaudio as sa
import errno import errno
from elevenlabs import generate, play, set_api_key, voices from elevenlabs import play, Voice
from elevenlabs.client import ElevenLabs
# Load environment variables from a .env file
load_dotenv() load_dotenv()
client = OpenAI() # Initialize OpenAI and ElevenLabs clients
clientOA = OpenAI()
set_api_key(os.environ.get("ELEVENLABS_API_KEY")) clientEL = ElevenLabs(
api_key=os.environ.get("ELEVENLABS_API_KEY")
)
def encode_image(image_path): def encode_image(image_path):
"""
Encodes an image to base64.
Args:
image_path (str): The path to the image file.
Returns:
str: Base64 encoded string of the image.
"""
while True: while True:
try: try:
with open(image_path, "rb") as image_file: with open(image_path, "rb") as image_file:
@ -26,80 +39,111 @@ def encode_image(image_path):
# File is being written to, wait a bit and retry # File is being written to, wait a bit and retry
time.sleep(0.1) time.sleep(0.1)
def play_audio(text): def play_audio(text):
audio = generate(text, voice=os.environ.get("ELEVENLABS_VOICE_ID")) """
Generates and plays audio from text using ElevenLabs.
Args:
text (str): The text to be converted to speech.
"""
# Generate audio from text
audio_generator = clientEL.generate(text=text, voice=Voice(voice_id=os.environ.get("ELEVENLABS_VOICE_ID")))
# Create a unique directory for storing the audio file
unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=") unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
dir_path = os.path.join("narration", unique_id) dir_path = os.path.join("narration", unique_id)
os.makedirs(dir_path, exist_ok=True) os.makedirs(dir_path, exist_ok=True)
file_path = os.path.join(dir_path, "audio.wav") file_path = os.path.join(dir_path, "audio.wav")
# Gather audio data from generator
audio_bytes = b''.join(audio_generator)
# Save audio to file
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(audio) f.write(audio_bytes)
play(audio)
# Play the generated audio
play(audio_bytes)
def generate_new_line(base64_image): def generate_new_line(base64_image):
"""
Generates a new line of messages for the OpenAI API call.
Args:
base64_image (str): Base64 encoded string of the image.
Returns:
list: A list of messages to be sent to the OpenAI API.
"""
return [ return [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": "Describe this image"}, {"type": "text", "text": "Describe this image as if you are Sir David Attenborough narrating a nature documentary about homo sapiens."},
{ {
"type": "image_url", "type": "image_url",
"image_url": f"data:image/jpeg;base64,{base64_image}", "image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
},
}, },
], ],
}, },
] ]
def analyze_image(base64_image, script): def analyze_image(base64_image, script):
response = client.chat.completions.create( """
model="gpt-4-vision-preview", Analyzes an image using OpenAI's language model.
Args:
base64_image (str): Base64 encoded string of the image.
script (list): List of previous messages to maintain context.
Returns:
str: The response text from OpenAI.
"""
response = clientOA.chat.completions.create(
model="gpt-4o-mini",
messages=[ messages=[
{ {
"role": "system", "role": "system",
"content": """ "content": """
You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary. You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it! Be accurate, snarky, and funny. Describe what the human is actually doing. Make it short and concise, within 3 sentences. If the human is doing something remotely interesting, make a big deal about it!
""", """,
}, },
] ]
+ script + script
+ generate_new_line(base64_image), + generate_new_line(base64_image),
max_tokens=500, max_tokens=150,
temperature=0.7,
) )
response_text = response.choices[0].message.content response_text = response.choices[0].message.content
return response_text return response_text
def main(): def main():
script = [] script = []
while True: while True:
# path to your image # Path to your image
image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
# getting the base64 encoding # Get the base64 encoding of the image
base64_image = encode_image(image_path) base64_image = encode_image(image_path)
# analyze posture # Analyze the image and generate a narration
print("👀 David is watching...") print("👀 David is watching...")
analysis = analyze_image(base64_image, script=script) analysis = analyze_image(base64_image, script=script)
# Print and play the narration
print("🎙️ David says:") print("🎙️ David says:")
print(analysis) print(analysis)
play_audio(analysis) play_audio(analysis)
script = script + [{"role": "assistant", "content": analysis}] # Append the analysis to the script for context in future requests
script.append({"role": "assistant", "content": analysis})
# wait for 5 seconds
time.sleep(5)
# wait for 3 seconds
time.sleep(3)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -6,7 +6,7 @@ certifi==2023.7.22
charset-normalizer==3.3.2 charset-normalizer==3.3.2
decorator==5.1.1 decorator==5.1.1
distro==1.8.0 distro==1.8.0
elevenlabs==0.2.26 elevenlabs==1.5.0
exceptiongroup==1.1.3 exceptiongroup==1.1.3
executing==2.0.1 executing==2.0.1
h11==0.14.0 h11==0.14.0