update gitignore, change judge to narrator

This commit is contained in:
Charlie Holtz 2023-11-15 10:43:29 -05:00
parent b38ee783fe
commit f2359959dd
4 changed files with 122 additions and 114 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.env
/venv
/frames
/narration

114
judge.py
View File

@@ -1,114 +0,0 @@
import os
from openai import OpenAI
import base64
import json
import time
import simpleaudio as sa
import errno
client = OpenAI()
def encode_image(image_path):
    """Read the image at *image_path* and return it base64-encoded as a str.

    Retries indefinitely while the file is locked by a concurrent writer
    (EACCES), sleeping briefly between attempts; any other I/O failure is
    re-raised immediately.
    """
    while True:
        try:
            with open(image_path, "rb") as fh:
                raw = fh.read()
            return base64.b64encode(raw).decode('utf-8')
        except IOError as err:
            # Anything other than "file in use" is a genuine failure.
            if err.errno != errno.EACCES:
                raise
            # Writer still holds the file; back off briefly and retry.
            time.sleep(0.1)
def play_audio(file_path):
    """Play the WAV file at *file_path*, blocking until playback finishes."""
    playback = sa.WaveObject.from_wave_file(file_path).play()
    playback.wait_done()
def analyze_posture(base64_image):
    """Ask the vision model to assess the posture in the given image.

    *base64_image* is a base64-encoded JPEG string; returns the model's
    free-text assessment.
    """
    system_message = {
        "role": "system",
        "content": """
        You are a posture rater.
        I send you a profile photo of a person and you tell me roughly how their posture is.
        It's ok if you can't see the side view of the person.
        """,
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "How is my posture?"},
            {
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{base64_image}",
            },
        ],
    }
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[system_message, user_message],
        max_tokens=300,
    )
    return response.choices[0].message.content
def summarize_analysis(analysis):
    """Condense a free-text posture *analysis* into a JSON rating.

    Returns a JSON string with keys "rating" (1-10), "reason", and
    "conclusive" (bool), as produced by the model in JSON mode.
    """
    system_message = {
        "role": "system",
        "content": """
        You are a posture rater.
        You received an analysis of someone's posture, and you have to summarize it with a numerical rating 1-10.
        It's okay if you can't see the side view of the person, or if the analysis is inconclusive. You must give a 1-10 rating.
        Respond in JSON, with a "rating": 1-10, "reason": "...", and "conclusive": true/false (whether or not the analysis was conclusive)
        """,
    }
    summary_response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=[system_message, {"role": "user", "content": analysis}],
        max_tokens=300,
    )
    return summary_response.choices[0].message.content
def main():
    """Poll loop: rate the posture in the latest webcam frame and react aloud.

    Runs forever: reads ./frames/frame.jpg (written by a separate capture
    process — TODO confirm), scores the posture via two model calls, then
    plays a praise or scold clip based on the rating.
    """
    while True:
        # path to your image
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
        # getting the base64 encoding
        base64_image = encode_image(image_path)
        # analyze posture
        print("🧘 Looking at your posture...")
        analysis = analyze_posture(base64_image)
        # summarize analysis
        print("🧘 Judging your posture...")
        result = summarize_analysis(analysis)
        result_json = json.loads(result)
        print(result_json)
        # play appropriate audio file based on rating
        if result_json['rating'] <= 5:
            play_audio('./assets/stop_slouching.wav')
        else:
            play_audio('./assets/wonderful_posture.wav')
        # wait 5 seconds before the next check (old comment wrongly said 30)
        time.sleep(5)


if __name__ == "__main__":
    main()

101
narrator.py Normal file
View File

@@ -0,0 +1,101 @@
import os
from openai import OpenAI
import base64
import json
import time
import simpleaudio as sa
import errno
from elevenlabs import generate, play, voices
client = OpenAI()
def encode_image(image_path):
    """Read the image at *image_path* and return it base64-encoded as a str.

    Retries indefinitely while the file is locked by a concurrent writer
    (EACCES), sleeping briefly between attempts; any other I/O failure is
    re-raised immediately.
    """
    while True:
        try:
            with open(image_path, "rb") as fh:
                raw = fh.read()
            return base64.b64encode(raw).decode("utf-8")
        except IOError as err:
            # Anything other than "file in use" is a genuine failure.
            if err.errno != errno.EACCES:
                raise
            # Writer still holds the file; back off briefly and retry.
            time.sleep(0.1)
def play_audio(text):
    """Synthesize *text* with ElevenLabs, archive the clip, then play it.

    Each narration is stored under narration/<random-id>/audio.wav so past
    clips are kept on disk for later review.
    """
    audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")

    # Random, URL-safe directory name so clips never collide.
    clip_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
    clip_dir = os.path.join("narration", clip_id)
    os.makedirs(clip_dir, exist_ok=True)

    with open(os.path.join(clip_dir, "audio.wav"), "wb") as out:
        out.write(audio)
    play(audio)
def generate_new_line(base64_image):
    """Build the one-element message list asking the model to describe the image.

    *base64_image* is a base64-encoded JPEG string; it is embedded as a
    data URL in an image_url content part.
    """
    data_url = f"data:image/jpeg;base64,{base64_image}"
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {"type": "image_url", "image_url": data_url},
            ],
        },
    ]
def analyze_image(base64_image, script):
    """Narrate *base64_image* in David Attenborough's voice, continuing *script*.

    *script* is the list of prior chat messages, included so the narration
    stays coherent across frames; returns the new narration text.
    """
    system_message = {
        "role": "system",
        "content": """
        You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
        Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
        """,
    }
    conversation = [system_message] + script + generate_new_line(base64_image)
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=conversation,
        max_tokens=500,
    )
    return response.choices[0].message.content
def main():
    """Capture loop: narrate the latest webcam frame aloud, forever.

    Reads ./frames/frame.jpg (written by a separate capture process — TODO
    confirm), asks the model for a narration that continues the running
    script, speaks it via ElevenLabs, and appends it to the script.
    """
    script = []
    while True:
        # path to your image
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")
        # getting the base64 encoding
        base64_image = encode_image(image_path)
        # narrate the frame (comment previously said "analyze posture" —
        # leftover from judge.py)
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)
        print("🎙️ David says:")
        print(analysis)
        play_audio(analysis)
        # remember this narration so future frames continue the same script
        script = script + [{"role": "assistant", "content": analysis}]
        # wait for 5 seconds
        time.sleep(5)


if __name__ == "__main__":
    main()

View File

@@ -1,19 +1,39 @@
annotated-types==0.6.0
anyio==3.7.1
appnope==0.1.3
asttokens==2.4.1
certifi==2023.7.22
charset-normalizer==3.3.2
decorator==5.1.1
distro==1.8.0
elevenlabs==0.2.26
exceptiongroup==1.1.3
executing==2.0.1
h11==0.14.0
httpcore==1.0.1
httpx==0.25.1
idna==3.4
ipython==8.17.2
jedi==0.19.1
matplotlib-inline==0.1.6
numpy==1.26.1
openai==1.1.1
opencv-python==4.8.1.78
parso==0.8.3
pexpect==4.8.0
prompt-toolkit==3.0.41
ptyprocess==0.7.0
pure-eval==0.2.2
pydantic==2.4.2
pydantic_core==2.10.1
Pygments==2.16.1
requests==2.31.0
six==1.16.0
sniffio==1.3.0
stack-data==0.6.3
tqdm==4.66.1
traitlets==5.13.0
typing_extensions==4.8.0
urllib3==2.0.7
wcwidth==0.2.10
websockets==12.0