feat: Update narrator.py and capture.py to reflect ElevenLabs API updates
This commit updates the code for the new ElevenLabs API version, following pull request #51 ("Update narrator.py to reflect API updates", https://github.com/cbh123/narrator/pull/51). The following changes have been made:

narrator.py:
- Updated the ElevenLabs client instantiation to the new API format.
- Removed the deprecated `set_api_key` and `get_api_key` methods and replaced them with the `ElevenLabs` class instantiation.
- Modified the `play_audio` function to handle the audio generator properly by collecting the audio data into a bytes-like object before writing it to a file and playing it.
- Added detailed docstrings and comments for better understanding and maintenance of the code.
- Ensured that the OpenAI client uses the correct API key and updated the image analysis to handle responses accurately.

capture.py:
- Ensured the frames folder is created if it doesn't exist.
- Updated the webcam initialization check and added a wait time for the camera to adjust to light levels.
- Adjusted the image resizing logic to improve performance before saving the frame.
- Added detailed print statements and comments for clarity and debugging purposes.

These changes ensure compatibility with the latest ElevenLabs API and improve the overall robustness and readability of the code.
This commit is contained in:
parent
b80925fb4d
commit
b3e600377c
|
@ -3,3 +3,6 @@
|
|||
/narration
|
||||
/frames/*
|
||||
!/frames/.gitkeep
|
||||
|
||||
# DS_STORE
|
||||
.DS_Store
|
22
capture.py
22
capture.py
|
@ -24,21 +24,21 @@ time.sleep(2)
|
|||
while True:
|
||||
ret, frame = cap.read()
|
||||
if ret:
|
||||
# Convert the frame to a PIL image
|
||||
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||
# Resize the image before saving to improve performance
|
||||
max_size = 400
|
||||
height, width = frame.shape[:2]
|
||||
if height > width:
|
||||
new_height = max_size
|
||||
new_width = int((max_size / height) * width)
|
||||
else:
|
||||
new_width = max_size
|
||||
new_height = int((max_size / width) * height)
|
||||
|
||||
# Resize the image
|
||||
max_size = 250
|
||||
ratio = max_size / max(pil_img.size)
|
||||
new_size = tuple([int(x*ratio) for x in pil_img.size])
|
||||
resized_img = pil_img.resize(new_size, Image.LANCZOS)
|
||||
|
||||
# Convert the PIL image back to an OpenCV image
|
||||
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
|
||||
frame = cv2.resize(frame, (new_width, new_height))
|
||||
|
||||
# Save the frame as an image file
|
||||
print("📸 Say cheese! Saving frame.")
|
||||
path = f"{folder}/frame.jpg"
|
||||
path = os.path.join(frames_dir, "frame.jpg")
|
||||
cv2.imwrite(path, frame)
|
||||
else:
|
||||
print("Failed to capture image")
|
||||
|
|
98
narrator.py
98
narrator.py
|
@ -2,19 +2,32 @@ import os
|
|||
from dotenv import load_dotenv
|
||||
from openai import OpenAI
|
||||
import base64
|
||||
import json
|
||||
# import json
|
||||
import time
|
||||
import simpleaudio as sa
|
||||
# import simpleaudio as sa
|
||||
import errno
|
||||
from elevenlabs import generate, play, set_api_key, voices
|
||||
from elevenlabs import play, Voice
|
||||
from elevenlabs.client import ElevenLabs
|
||||
|
||||
# Load environment variables from a .env file
|
||||
load_dotenv()
|
||||
|
||||
client = OpenAI()
|
||||
|
||||
set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
|
||||
# Initialize OpenAI and ElevenLabs clients
|
||||
clientOA = OpenAI()
|
||||
clientEL = ElevenLabs(
|
||||
api_key=os.environ.get("ELEVENLABS_API_KEY")
|
||||
)
|
||||
|
||||
def encode_image(image_path):
|
||||
"""
|
||||
Encodes an image to base64.
|
||||
|
||||
Args:
|
||||
image_path (str): The path to the image file.
|
||||
|
||||
Returns:
|
||||
str: Base64 encoded string of the image.
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
with open(image_path, "rb") as image_file:
|
||||
|
@ -26,80 +39,111 @@ def encode_image(image_path):
|
|||
# File is being written to, wait a bit and retry
|
||||
time.sleep(0.1)
|
||||
|
||||
|
||||
def play_audio(text):
|
||||
audio = generate(text, voice=os.environ.get("ELEVENLABS_VOICE_ID"))
|
||||
"""
|
||||
Generates and plays audio from text using ElevenLabs.
|
||||
|
||||
Args:
|
||||
text (str): The text to be converted to speech.
|
||||
"""
|
||||
# Generate audio from text
|
||||
audio_generator = clientEL.generate(text=text, voice=Voice(voice_id=os.environ.get("ELEVENLABS_VOICE_ID")))
|
||||
|
||||
# Create a unique directory for storing the audio file
|
||||
unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
|
||||
dir_path = os.path.join("narration", unique_id)
|
||||
os.makedirs(dir_path, exist_ok=True)
|
||||
file_path = os.path.join(dir_path, "audio.wav")
|
||||
|
||||
# Gather audio data from generator
|
||||
audio_bytes = b''.join(audio_generator)
|
||||
|
||||
# Save audio to file
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(audio)
|
||||
|
||||
play(audio)
|
||||
f.write(audio_bytes)
|
||||
|
||||
# Play the generated audio
|
||||
play(audio_bytes)
|
||||
|
||||
def generate_new_line(base64_image):
|
||||
"""
|
||||
Generates a new line of messages for the OpenAI API call.
|
||||
|
||||
Args:
|
||||
base64_image (str): Base64 encoded string of the image.
|
||||
|
||||
Returns:
|
||||
list: A list of messages to be sent to the OpenAI API.
|
||||
"""
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this image"},
|
||||
{"type": "text", "text": "Describe this image as if you are Sir David Attenborough narrating a nature documentary about homo sapiens."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": f"data:image/jpeg;base64,{base64_image}",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base64_image}",
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def analyze_image(base64_image, script):
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4-vision-preview",
|
||||
"""
|
||||
Analyzes an image using OpenAI's language model.
|
||||
|
||||
Args:
|
||||
base64_image (str): Base64 encoded string of the image.
|
||||
script (list): List of previous messages to maintain context.
|
||||
|
||||
Returns:
|
||||
str: The response text from OpenAI.
|
||||
"""
|
||||
response = clientOA.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
|
||||
Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
|
||||
Be accurate, snarky, and funny. Describe what the human is actually doing. Make it short and concise, within 3 sentences. If the human is doing something remotely interesting, make a big deal about it!
|
||||
""",
|
||||
},
|
||||
]
|
||||
+ script
|
||||
+ generate_new_line(base64_image),
|
||||
max_tokens=500,
|
||||
max_tokens=150,
|
||||
temperature=0.7,
|
||||
)
|
||||
response_text = response.choices[0].message.content
|
||||
return response_text
|
||||
|
||||
|
||||
def main():
|
||||
script = []
|
||||
|
||||
while True:
|
||||
# path to your image
|
||||
# Path to your image
|
||||
image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
|
||||
|
||||
# getting the base64 encoding
|
||||
# Get the base64 encoding of the image
|
||||
base64_image = encode_image(image_path)
|
||||
|
||||
# analyze posture
|
||||
# Analyze the image and generate a narration
|
||||
print("👀 David is watching...")
|
||||
analysis = analyze_image(base64_image, script=script)
|
||||
|
||||
# Print and play the narration
|
||||
print("🎙️ David says:")
|
||||
print(analysis)
|
||||
|
||||
play_audio(analysis)
|
||||
|
||||
script = script + [{"role": "assistant", "content": analysis}]
|
||||
|
||||
# wait for 5 seconds
|
||||
time.sleep(5)
|
||||
# Append the analysis to the script for context in future requests
|
||||
script.append({"role": "assistant", "content": analysis})
|
||||
|
||||
# wait for 3 seconds
|
||||
time.sleep(3)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -6,7 +6,7 @@ certifi==2023.7.22
|
|||
charset-normalizer==3.3.2
|
||||
decorator==5.1.1
|
||||
distro==1.8.0
|
||||
elevenlabs==0.2.26
|
||||
elevenlabs==1.5.0
|
||||
exceptiongroup==1.1.3
|
||||
executing==2.0.1
|
||||
h11==0.14.0
|
||||
|
|
Loading…
Reference in New Issue