Gemini Vision
This commit is contained in:
parent
1174275b5b
commit
b44e7a1269
|
@ -23,6 +23,7 @@ Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), a
|
|||
```
|
||||
export OPENAI_API_KEY=<token>
|
||||
export ELEVENLABS_API_KEY=<eleven-token>
|
||||
export GEMINI_API_KEY=<Gemini-api-key>
|
||||
```
|
||||
|
||||
Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab.
|
||||
|
@ -42,4 +43,5 @@ In another terminal, run the narrator:
|
|||
```bash
|
||||
python narrator.py
|
||||
```
|
||||
Choose the model by entering 1 (GPT-4 Vision) or 2 (Gemini Pro Vision) at the prompt.
|
||||
|
||||
|
|
70
narrator.py
70
narrator.py
|
@ -6,8 +6,13 @@ import time
|
|||
import simpleaudio as sa
|
||||
import errno
|
||||
from elevenlabs import generate, play, set_api_key, voices
|
||||
import google.generativeai as genai
|
||||
import PIL.Image
|
||||
|
||||
|
||||
|
||||
client = OpenAI()
|
||||
genai.configure(api_key = os.environ.get("GEMINI_API_KEY"))
|
||||
|
||||
set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
|
||||
|
||||
|
@ -76,26 +81,63 @@ def analyze_image(base64_image, script):
|
|||
def main():
|
||||
script = []
|
||||
|
||||
while True:
|
||||
# path to your image
|
||||
image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
|
||||
model = input("Which model would you like to use? 1. GPT-4 Vision 2. Gemini Pro Vision \n")
|
||||
|
||||
# getting the base64 encoding
|
||||
base64_image = encode_image(image_path)
|
||||
|
||||
# analyze posture
|
||||
if model == "1":
|
||||
print("using GPT-4 Vision")
|
||||
print("👀 David is watching...")
|
||||
analysis = analyze_image(base64_image, script=script)
|
||||
|
||||
print("🎙️ David says:")
|
||||
print(analysis)
|
||||
while True:
|
||||
# path to your image
|
||||
image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
|
||||
|
||||
play_audio(analysis)
|
||||
|
||||
script = script + [{"role": "assistant", "content": analysis}]
|
||||
# getting the base64 encoding
|
||||
base64_image = encode_image(image_path)
|
||||
analysis = analyze_image(base64_image, script=script)
|
||||
|
||||
# wait for 5 seconds
|
||||
time.sleep(5)
|
||||
|
||||
print("🎙️ David says:")
|
||||
print(analysis)
|
||||
|
||||
play_audio(analysis)
|
||||
|
||||
script = script + [{"role": "assistant", "content": analysis}]
|
||||
|
||||
# wait for 5 seconds
|
||||
time.sleep(5)
|
||||
|
||||
elif model == "2":
|
||||
print("using Gemini Pro Vision")
|
||||
print("👀 David is watching...")
|
||||
|
||||
while True:
|
||||
# path to your image
|
||||
image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
|
||||
|
||||
img = PIL.Image.open(image_path)
|
||||
|
||||
model = genai.GenerativeModel('gemini-pro-vision')
|
||||
response = model.generate_content(["""
|
||||
You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
|
||||
Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
|
||||
"""+"refer to these previous narrations".join(script), img])
|
||||
|
||||
response_text = response.text
|
||||
|
||||
|
||||
print("🎙️ David says:")
|
||||
print(response_text)
|
||||
|
||||
play_audio(response_text)
|
||||
|
||||
script = script + [response_text]
|
||||
|
||||
# wait for 5 seconds
|
||||
time.sleep(5)
|
||||
|
||||
else:
|
||||
print("Please enter a valid model number")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -39,3 +39,4 @@ typing_extensions==4.8.0
|
|||
urllib3==2.0.7
|
||||
wcwidth==0.2.10
|
||||
websockets==12.0
|
||||
google-generativeai==0.3.1
|
Loading…
Reference in New Issue