From f2359959ddc5b655532f6eef3136a52ac223fd41 Mon Sep 17 00:00:00 2001 From: Charlie Holtz Date: Wed, 15 Nov 2023 10:43:29 -0500 Subject: [PATCH] update gitignore, change judge to narrator --- .gitignore | 1 + judge.py | 114 ----------------------------------------------- narrator.py | 101 +++++++++++++++++++++++++++++++++++++++++ requirements.txt | 20 +++++++++ 4 files changed, 122 insertions(+), 114 deletions(-) delete mode 100644 judge.py create mode 100644 narrator.py diff --git a/.gitignore b/.gitignore index 49a8ae3..bff6c45 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .env /venv /frames +/narration diff --git a/judge.py b/judge.py deleted file mode 100644 index 787cf89..0000000 --- a/judge.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -from openai import OpenAI -import base64 -import json -import time -import simpleaudio as sa -import errno - -client = OpenAI() - - -def encode_image(image_path): - while True: - try: - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') - except IOError as e: - if e.errno != errno.EACCES: - # Not a "file in use" error, re-raise - raise - # File is being written to, wait a bit and retry - time.sleep(0.1) - - -def play_audio(file_path): - wave_obj = sa.WaveObject.from_wave_file(file_path) - play_obj = wave_obj.play() - play_obj.wait_done() - - -def analyze_posture(base64_image): - response = client.chat.completions.create( - model="gpt-4-vision-preview", - messages=[ - { - "role": "system", - "content": """ - You are a posture rater. - I send you a profile photo of a person and you tell me roughly how their posture is. - It's ok if you can't see the side view of the person. - """, - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "How is my posture?"}, - { - "type": "image_url", - "image_url": f"data:image/jpeg;base64,{base64_image}", - }, - ], - } - ], - max_tokens=300, - ) - return response.choices[0].message.content - - -def summarize_analysis(analysis): - summary_response = client.chat.completions.create( - model="gpt-4-1106-preview", - response_format={"type": "json_object"}, - messages=[ - { - "role": "system", - "content": """ - You are a posture rater. - You received an analysis of someone's posture, and you have to summarize it with a numerical rating 1-10. - It's okay if you can't see the side view of the person, or if the analysis is inconclusive. You must give a 1-10 rating. - - Respond in JSON, with a "rating": 1-10, "reason": "...", and "conclusive": true/false (whether or not the analysis was conclusive) - """, - }, - { - "role": "user", - "content": analysis, - } - ], - max_tokens=300, - ) - return summary_response.choices[0].message.content - - -def main(): - while True: - - # path to your image - image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") - - # getting the base64 encoding - base64_image = encode_image(image_path) - - # analyze posture - print("🧘 Looking at your posture...") - analysis = analyze_posture(base64_image) - - # summarize analysis - print("🧘 Judging your posture...") - result = summarize_analysis(analysis) - result_json = json.loads(result) - - print(result_json) - - # play appropriate audio file based on rating - if result_json['rating'] <= 5: - play_audio('./assets/stop_slouching.wav') - else: - play_audio('./assets/wonderful_posture.wav') - - # wait for 30 seconds - time.sleep(5) - -if __name__ == "__main__": - main() diff --git a/narrator.py b/narrator.py new file mode 100644 index 0000000..0435b47 --- /dev/null +++ b/narrator.py @@ -0,0 +1,101 @@ +import os +from openai import OpenAI +import base64 +import json +import time +import simpleaudio as sa +import errno +from elevenlabs import generate, play, voices + +client = OpenAI() + + +def encode_image(image_path): + while True: + try: + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + except IOError as e: + if e.errno != errno.EACCES: + # Not a "file in use" error, re-raise + raise + # File is being written to, wait a bit and retry + time.sleep(0.1) + + +def play_audio(text): + audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2") + + unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=") + dir_path = os.path.join("narration", unique_id) + os.makedirs(dir_path, exist_ok=True) + file_path = os.path.join(dir_path, "audio.wav") + + with open(file_path, "wb") as f: + f.write(audio) + + play(audio) + + +def generate_new_line(base64_image): + return [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image"}, + { + "type": "image_url", + "image_url": f"data:image/jpeg;base64,{base64_image}", + }, + ], + }, + ] + + +def analyze_image(base64_image, script): + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "system", + "content": """ + You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary. + Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it! + """, + }, + ] + + script + + generate_new_line(base64_image), + max_tokens=500, + ) + response_text = response.choices[0].message.content + return response_text + + +def main(): + script = [] + + while True: + # path to your image + image_path = os.path.join(os.getcwd(), "./frames/frame.jpg") + + # getting the base64 encoding + base64_image = encode_image(image_path) + + # analyze posture + print("👀 David is watching...") + analysis = analyze_image(base64_image, script=script) + + print("🎙️ David says:") + print(analysis) + + play_audio(analysis) + + script = script + [{"role": "assistant", "content": analysis}] + + # wait for 5 seconds + time.sleep(5) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 90ecebc..0c0a6aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,39 @@ annotated-types==0.6.0 anyio==3.7.1 +appnope==0.1.3 +asttokens==2.4.1 certifi==2023.7.22 charset-normalizer==3.3.2 +decorator==5.1.1 distro==1.8.0 +elevenlabs==0.2.26 +exceptiongroup==1.1.3 +executing==2.0.1 h11==0.14.0 httpcore==1.0.1 httpx==0.25.1 idna==3.4 +ipython==8.17.2 +jedi==0.19.1 +matplotlib-inline==0.1.6 numpy==1.26.1 openai==1.1.1 opencv-python==4.8.1.78 +parso==0.8.3 +pexpect==4.8.0 +prompt-toolkit==3.0.41 +ptyprocess==0.7.0 +pure-eval==0.2.2 pydantic==2.4.2 pydantic_core==2.10.1 +Pygments==2.16.1 requests==2.31.0 +six==1.16.0 sniffio==1.3.0 +stack-data==0.6.3 tqdm==4.66.1 +traitlets==5.13.0 typing_extensions==4.8.0 urllib3==2.0.7 +wcwidth==0.2.10 +websockets==12.0