feat: Update narrator.py and capture.py to reflect ElevenLabs API updates

This commit addresses changes made to update the ElevenLabs API version as per pull request #51 (Update narrator.py to reflect API updates #51 - https://github.com/cbh123/narrator/pull/51). The following changes have been made: narrator.py: - Updated the ElevenLabs client instantiation to the new API format. - Removed the deprecated `set_api_key` and `get_api_key` methods and replaced them with the `ElevenLabs` class instantiation. - Modified the `play_audio` function to handle the audio generator properly by collecting the audio data into a bytes-like object before writing it to a file and playing it. - Added detailed docstrings and comments for better understanding and maintenance of the code. - Ensured that the OpenAI client uses the correct API key and updated the image analysis to handle responses accurately. capture.py: - Ensured the frames folder is created if it doesn't exist. - Updated the webcam initialization check and added a wait time for the camera to adjust light levels. - Adjusted the image resizing logic to improve performance before saving the frame. - Added detailed print statements and comments for clarity and debugging purposes. These changes ensure compatibility with the latest ElevenLabs API and improve the overall robustness and readability of the code.
2024-07-22 16:38:04 -05:00 · 2024-07-22 16:38:04 -05:00 · b3e600377c
parent b80925fb4d
commit b3e600377c
4 changed files with 89 additions and 42 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,6 @@
 /narration
 /frames/*
 !/frames/.gitkeep
 # DS_STORE
 .DS_Store
--- a/capture.py
+++ b/capture.py
@ -24,21 +24,21 @@ time.sleep(2)
 while True:
    ret, frame = cap.read()
    if ret:
-        # Convert the frame to a PIL image
+        # Resize the image before saving to improve performance
-        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        max_size = 400
        height, width = frame.shape[:2]
        if height > width:
            new_height = max_size
            new_width = int((max_size / height) * width)
        else:
            new_width = max_size
            new_height = int((max_size / width) * height)
-        # Resize the image
+        frame = cv2.resize(frame, (new_width, new_height))
        max_size = 250
        ratio = max_size / max(pil_img.size)
        new_size = tuple([int(x*ratio) for x in pil_img.size])
        resized_img = pil_img.resize(new_size, Image.LANCZOS)
        # Convert the PIL image back to an OpenCV image
        frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
        # Save the frame as an image file
        print("📸 Say cheese! Saving frame.")
-        path = f"{folder}/frame.jpg"
+        path = os.path.join(frames_dir, "frame.jpg")
        cv2.imwrite(path, frame)
    else:
        print("Failed to capture image")
--- a/narrator.py
+++ b/narrator.py
@ -2,19 +2,32 @@ import os
 from dotenv import load_dotenv
 from openai import OpenAI
 import base64
-import json
+# import json
 import time
-import simpleaudio as sa
+# import simpleaudio as sa
 import errno
-from elevenlabs import generate, play, set_api_key, voices
+from elevenlabs import play, Voice
 from elevenlabs.client import ElevenLabs
 # Load environment variables from a .env file
 load_dotenv()
-client = OpenAI()
+# Initialize OpenAI and ElevenLabs clients
-
+clientOA = OpenAI()
-set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
+clientEL = ElevenLabs(
  api_key=os.environ.get("ELEVENLABS_API_KEY")
 )
 def encode_image(image_path):
    """
    Encodes an image to base64.
    Args:
        image_path (str): The path to the image file.
    Returns:
        str: Base64 encoded string of the image.
    """
    while True:
        try:
            with open(image_path, "rb") as image_file:
@ -26,80 +39,111 @@ def encode_image(image_path):
            # File is being written to, wait a bit and retry
            time.sleep(0.1)
 def play_audio(text):
-    audio = generate(text, voice=os.environ.get("ELEVENLABS_VOICE_ID"))
+    """
    Generates and plays audio from text using ElevenLabs.
    Args:
        text (str): The text to be converted to speech.
    """
    # Generate audio from text
    audio_generator = clientEL.generate(text=text, voice=Voice(voice_id=os.environ.get("ELEVENLABS_VOICE_ID")))
    # Create a unique directory for storing the audio file
    unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
    dir_path = os.path.join("narration", unique_id)
    os.makedirs(dir_path, exist_ok=True)
    file_path = os.path.join(dir_path, "audio.wav")
    # Gather audio data from generator
    audio_bytes = b''.join(audio_generator)
    # Save audio to file
    with open(file_path, "wb") as f:
-        f.write(audio)
+        f.write(audio_bytes)
    play(audio)
    # Play the generated audio
    play(audio_bytes)
 def generate_new_line(base64_image):
    """
    Generates a new line of messages for the OpenAI API call.
    Args:
        base64_image (str): Base64 encoded string of the image.
    Returns:
        list: A list of messages to be sent to the OpenAI API.
    """
    return [
        {
            "role": "user",
            "content": [
-                {"type": "text", "text": "Describe this image"},
+                {"type": "text", "text": "Describe this image as if you are Sir David Attenborough narrating a nature documentary about homo sapiens."},
                {
                    "type": "image_url",
-                    "image_url": f"data:image/jpeg;base64,{base64_image}",
+                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        },
    ]
 def analyze_image(base64_image, script):
-    response = client.chat.completions.create(
+    """
-        model="gpt-4-vision-preview",
+    Analyzes an image using OpenAI's language model.
    Args:
        base64_image (str): Base64 encoded string of the image.
        script (list): List of previous messages to maintain context.
    Returns:
        str: The response text from OpenAI.
    """
    response = clientOA.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": """
                You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
-                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
+                Be accurate, snarky, and funny. Describe what the human is actually doing. Make it short and concise, within 3 sentences. If the human is doing something remotely interesting, make a big deal about it!
                """,
            },
        ]
        + script
        + generate_new_line(base64_image),
-        max_tokens=500,
+        max_tokens=150,
        temperature=0.7,
    )
    response_text = response.choices[0].message.content
    return response_text
 def main():
    script = []
    while True:
-        # path to your image
+        # Path to your image
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
-        # getting the base64 encoding
+        # Get the base64 encoding of the image
        base64_image = encode_image(image_path)
-        # analyze posture
+        # Analyze the image and generate a narration
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)
        # Print and play the narration
        print("🎙️ David says:")
        print(analysis)
        play_audio(analysis)
-        script = script + [{"role": "assistant", "content": analysis}]
+        # Append the analysis to the script for context in future requests
-
+        script.append({"role": "assistant", "content": analysis})
        # wait for 5 seconds
        time.sleep(5)
        # wait for 3 seconds
        time.sleep(3)
 if __name__ == "__main__":
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -6,7 +6,7 @@ certifi==2023.7.22
 charset-normalizer==3.3.2
 decorator==5.1.1
 distro==1.8.0
-elevenlabs==0.2.26
+elevenlabs==1.5.0
 exceptiongroup==1.1.3
 executing==2.0.1
 h11==0.14.0