diff --git a/capture.py b/capture.py index bc9845c..71349bd 100644 --- a/capture.py +++ b/capture.py @@ -1,8 +1,9 @@ -import cv2 -import time -from PIL import Image -import numpy as np import os +import time + +import cv2 +import numpy as np +from PIL import Image # Folder folder = "frames" @@ -30,7 +31,7 @@ while True: # Resize the image max_size = 250 ratio = max_size / max(pil_img.size) - new_size = tuple([int(x*ratio) for x in pil_img.size]) + new_size = tuple([int(x * ratio) for x in pil_img.size]) resized_img = pil_img.resize(new_size, Image.LANCZOS) # Convert the PIL image back to an OpenCV image diff --git a/narrator.py b/narrator.py index 40b12fa..3a652c3 100644 --- a/narrator.py +++ b/narrator.py @@ -1,30 +1,33 @@ import base64 import errno -import json import os import time -import simpleaudio as sa -from elevenlabs import generate, play, set_api_key, stream, voices +from elevenlabs import generate, play, set_api_key, stream from openai import OpenAI -from pynput import keyboard +from pynput import ( # Using pynput to listen for a keypress instead of native keyboard module which was requiring admin privileges + keyboard, +) client = OpenAI() set_api_key(os.environ.get("ELEVENLABS_API_KEY")) +# Initializes the variables based on their respective environment variable values, defaulting to false +isStreaming = os.environ.get("ELEVENLABS_STREAMING", "false") == "true" +isPhotoBooth = os.environ.get("PHOTOBOOTH_MODE", "false") == "true" + script = [] +narrator = "Sir David Attenborough" def on_press(key): - print(f"Key {key} pressed.") if key == keyboard.Key.space: + # When space bar is pressed, run the main function which analyzes the image and generates the audio _main() def on_release(key): - print(f"Key {key} released.") - if key == keyboard.Key.esc: # Stop listener return False @@ -37,12 +40,6 @@ listener = keyboard.Listener(on_press=on_press, on_release=on_release) listener.start() -# This code initializes the variable 'isStreaming' based on the value of 
the environment variable 'ELEVENLABS_STREAMIMAGES'. -# If the value of 'ELEVENLABS_STREAMIMAGES' is "true", then 'isStreaming' is set to True. -# Otherwise, 'isStreaming' is set to False. -isStreaming = os.environ.get("ELEVENLABS_STREAMING", "false") == "true" - - def encode_image(image_path): while True: try: @@ -65,9 +62,11 @@ def play_audio(text): ) if isStreaming: + # Stream the audio for more real-time responsiveness stream(audio) return + # Save the audio to a file and play it unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=") dir_path = os.path.join("narration", unique_id) os.makedirs(dir_path, exist_ok=True) @@ -86,7 +85,7 @@ def generate_new_line(base64_image): "content": [ { "type": "text", - "text": "Describe this image as if you are David Attenborough", + "text": f"Describe this image as if you are {narrator}", }, { "type": "image_url", @@ -103,8 +102,8 @@ def analyze_image(base64_image, script): messages=[ { "role": "system", - "content": """ - You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary. + "content": f""" + You are {narrator}. Narrate the picture of the human as if it is a nature documentary. Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it! 
""", }, @@ -119,6 +118,7 @@ def analyze_image(base64_image, script): def _main(): global script + # path to your image image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") @@ -126,7 +126,7 @@ def _main(): base64_image = encode_image(image_path) # analyze posture - print("👀 David is watching...") + print(f"👀 {narrator} is watching...") analysis = analyze_image(base64_image, script=script) print("🎙️ David says:") @@ -138,30 +138,18 @@ def _main(): def main(): - # script = [] - while True: - pass - # path to your image - # image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") + if isPhotoBooth: + pass + else: + _main() - # # getting the base64 encoding - # base64_image = encode_image(image_path) + # wait for 5 seconds + time.sleep(5) - # # analyze posture - # print("👀 David is watching...") - # analysis = analyze_image(base64_image, script=script) - - # print("🎙️ David says:") - # print(analysis) - - # play_audio(analysis) - - # script = script + [{"role": "assistant", "content": analysis}] - - # # wait for 5 seconds - # time.sleep(5) +if isPhotoBooth: + print(f"Press the spacebar to trigger {narrator}") if __name__ == "__main__": main()