Gemini vision

2023-12-14 17:15:13 +00:00 · 2023-12-14 17:15:13 +00:00 · b44e7a1269
parent 1174275b5b
commit b44e7a1269
3 changed files with 60 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -23,6 +23,7 @@ Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), a
 ```
 export OPENAI_API_KEY=<token>
 export ELEVENLABS_API_KEY=<eleven-token>
 export GEMINI_API_KEY=<Gemini-api-key>
 ```
 Make a new voice in ElevenLabs and get its voice ID using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab.
@ -42,4 +43,5 @@ In another terminal, run the narrator:
 ```bash
 python narrator.py
 ```
 When prompted, choose the model by entering 1 (GPT-4 Vision) or 2 (Gemini Pro Vision).
--- a/narrator.py
+++ b/narrator.py
@ -6,8 +6,13 @@ import time
 import simpleaudio as sa
 import errno
 from elevenlabs import generate, play, set_api_key, voices
 import google.generativeai as genai
 import PIL.Image
 client = OpenAI()
 genai.configure(api_key = os.environ.get("GEMINI_API_KEY"))
 set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
@ -76,26 +81,63 @@ def analyze_image(base64_image, script):
 def main():
    script = []
-    while True:
+    model = input("Which model would you like to use? 1. GPT-4 Vision 2. Gemini Pro Vision \n")
        # path to your image
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
-        # getting the base64 encoding
+    if model == "1":
-        base64_image = encode_image(image_path)
+        print("using GPT-4 Vision")
        # analyze posture
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)
-        print("🎙️ David says:")
+        while True:
-        print(analysis)
+            # path to your image
            image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
        play_audio(analysis)
-        script = script + [{"role": "assistant", "content": analysis}]
+            # getting the base64 encoding
            base64_image = encode_image(image_path)
            analysis = analyze_image(base64_image, script=script)
-        # wait for 5 seconds
+
-        time.sleep(5)
+            print("🎙️ David says:")
            print(analysis)
            play_audio(analysis)
            script = script + [{"role": "assistant", "content": analysis}]
            # wait for 5 seconds
            time.sleep(5)
    elif model == "2":
        print("using Gemini Pro Vision")
        print("👀 David is watching...")
        while True:
            # path to your image
            image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
            img = PIL.Image.open(image_path)
            model = genai.GenerativeModel('gemini-pro-vision')
            response = model.generate_content(["""
                You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
                """+"refer to these previous narrations".join(script), img])
            response_text = response.text
            print("🎙️ David says:")
            print(response_text)
            play_audio(response_text)
            script = script + [response_text]
            # wait for 5 seconds
            time.sleep(5)
    else:
        print("Please enter a valid model number")
 if __name__ == "__main__":
--- a/requirements.txt
+++ b/requirements.txt
@ -39,3 +39,4 @@ typing_extensions==4.8.0
 urllib3==2.0.7
 wcwidth==0.2.10
 websockets==12.0
 google-generativeai==0.3.1