From b44e7a12699d6c0bdba58132a3786c4df91592b5 Mon Sep 17 00:00:00 2001 From: Taradepan R <63502147+taradepan@users.noreply.github.com> Date: Thu, 14 Dec 2023 17:15:13 +0000 Subject: [PATCH] Gemini Vision --- README.md | 2 ++ narrator.py | 72 ++++++++++++++++++++++++++++++++++++++---------- requirements.txt | 1 + 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c10bdcb..5e23697 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), a ``` export OPENAI_API_KEY= export ELEVENLABS_API_KEY= +export GEMINI_API_KEY= ``` Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab. @@ -42,4 +43,5 @@ In another terminal, run the narrator: ```bash python narrator.py ``` +Choose the model by selecting either 1 or 2. diff --git a/narrator.py b/narrator.py index cd086f7..a41eff8 100644 --- a/narrator.py +++ b/narrator.py @@ -6,8 +6,13 @@ import time import simpleaudio as sa import errno from elevenlabs import generate, play, set_api_key, voices +import google.generativeai as genai +import PIL.Image + + client = OpenAI() +genai.configure(api_key = os.environ.get("GEMINI_API_KEY")) set_api_key(os.environ.get("ELEVENLABS_API_KEY")) @@ -76,26 +81,63 @@ def analyze_image(base64_image, script): def main(): script = [] - while True: - # path to your image - image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") - - # getting the base64 encoding - base64_image = encode_image(image_path) - - # analyze posture + model = input("Which model would you like to use? 1. GPT-4 Vision 2. 
Gemini Pro Vision \n") + + if model == "1": + print("using GPT-4 Vision") print("👀 David is watching...") - analysis = analyze_image(base64_image, script=script) - print("🎙️ David says:") - print(analysis) + while True: + # path to your image + image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") - play_audio(analysis) - script = script + [{"role": "assistant", "content": analysis}] + # getting the base64 encoding + base64_image = encode_image(image_path) + analysis = analyze_image(base64_image, script=script) - # wait for 5 seconds - time.sleep(5) + + print("🎙️ David says:") + print(analysis) + + play_audio(analysis) + + script = script + [{"role": "assistant", "content": analysis}] + + # wait for 5 seconds + time.sleep(5) + + elif model == "2": + print("using Gemini Pro Vision") + print("👀 David is watching...") + + while True: + # path to your image + image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") + + img = PIL.Image.open(image_path) + + model = genai.GenerativeModel('gemini-pro-vision') + response = model.generate_content([""" + You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary. + Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it! + """+"refer to these previous narrations".join(script), img]) + + response_text = response.text + + + print("🎙️ David says:") + print(response_text) + + play_audio(response_text) + + script = script + [response_text] + + # wait for 5 seconds + time.sleep(5) + + else: + print("Please enter a valid model number") if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 12cae1c..c97e8c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,3 +39,4 @@ typing_extensions==4.8.0 urllib3==2.0.7 wcwidth==0.2.10 websockets==12.0 +google-generativeai==0.3.1 \ No newline at end of file