update gitignore, change judge to narrator
This commit is contained in:
parent
b38ee783fe
commit
f2359959dd
|
@@ -1,3 +1,4 @@
|
||||||
.env
|
.env
|
||||||
/venv
|
/venv
|
||||||
/frames
|
/frames
|
||||||
|
/narration
|
||||||
|
|
114
judge.py
114
judge.py
|
@@ -1,114 +0,0 @@
|
||||||
import os
|
|
||||||
from openai import OpenAI
|
|
||||||
import base64
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import simpleaudio as sa
|
|
||||||
import errno
|
|
||||||
|
|
||||||
client = OpenAI()
|
|
||||||
|
|
||||||
|
|
||||||
def encode_image(image_path):
    """Return the base64 text encoding of the image at *image_path*.

    The frame file is rewritten continuously by the capture script, so a
    transient EACCES ("file in use") error is retried indefinitely.
    """
    while True:
        try:
            with open(image_path, "rb") as fh:
                data = fh.read()
        except IOError as err:
            # Not a "file in use" error, re-raise
            if err.errno != errno.EACCES:
                raise
            # File is being written to, wait a bit and retry
            time.sleep(0.1)
        else:
            return base64.b64encode(data).decode('utf-8')
|
|
||||||
|
|
||||||
|
|
||||||
def play_audio(file_path):
    """Play the WAV file at *file_path* and block until playback completes."""
    playback = sa.WaveObject.from_wave_file(file_path).play()
    playback.wait_done()
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_posture(base64_image):
    """Send the base64-encoded frame to GPT-4 Vision and return its free-text
    assessment of the person's posture.

    base64_image: JPEG frame encoded as a base64 string (no data-URI prefix).
    Returns the model's narrative analysis as a string.
    Raises whatever the OpenAI client raises on API failure (no local handling).
    """
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                You are a posture rater.
                I send you a profile photo of a person and you tell me roughly how their posture is.
                It's ok if you can't see the side view of the person.
                """,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "How is my posture?"},
                    {
                        "type": "image_url",
                        # NOTE(review): the bare data-URI string here matches
                        # the old openai==1.1.1 shape; newer clients expect
                        # {"url": ...} — confirm against the pinned version.
                        "image_url": f"data:image/jpeg;base64,{base64_image}",
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content
|
|
||||||
|
|
||||||
|
|
||||||
def summarize_analysis(analysis):
    """Condense a free-text posture analysis into a structured JSON verdict.

    analysis: the narrative text produced by analyze_posture().
    Returns the model's reply as a string; response_format forces JSON mode,
    and the prompt requests keys "rating" (1-10), "reason", and "conclusive".
    The caller is responsible for json.loads()-ing the result.
    """
    summary_response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        # JSON mode: the model must emit a single valid JSON object.
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": """
                You are a posture rater.
                You received an analysis of someone's posture, and you have to summarize it with a numerical rating 1-10.
                It's okay if you can't see the side view of the person, or if the analysis is inconclusive. You must give a 1-10 rating.

                Respond in JSON, with a "rating": 1-10, "reason": "...", and "conclusive": true/false (whether or not the analysis was conclusive)
                """,
            },
            {
                "role": "user",
                "content": analysis,
            }
        ],
        max_tokens=300,
    )
    return summary_response.choices[0].message.content
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Judge loop: repeatedly grab the latest webcam frame, have GPT rate the
    posture, play a matching audio clip, then wait and repeat forever."""
    while True:
        # Path to the frame the capture process keeps overwriting.
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")

        # getting the base64 encoding (retries while the frame is mid-write)
        base64_image = encode_image(image_path)

        # analyze posture
        print("🧘 Looking at your posture...")
        analysis = analyze_posture(base64_image)

        # summarize analysis into JSON
        print("🧘 Judging your posture...")
        result = summarize_analysis(analysis)
        # JSON mode is requested upstream; this raises json.JSONDecodeError
        # if the model still deviates.
        result_json = json.loads(result)

        print(result_json)

        # play appropriate audio file based on rating (1-10 scale, <=5 is bad)
        if result_json['rating'] <= 5:
            play_audio('./assets/stop_slouching.wav')
        else:
            play_audio('./assets/wonderful_posture.wav')

        # wait 5 seconds before sampling the next frame
        # (original comment said 30 seconds — the code sleeps 5)
        time.sleep(5)


if __name__ == "__main__":
    main()
|
|
|
@@ -0,0 +1,101 @@
|
||||||
|
import os
|
||||||
|
from openai import OpenAI
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import simpleaudio as sa
|
||||||
|
import errno
|
||||||
|
from elevenlabs import generate, play, voices
|
||||||
|
|
||||||
|
client = OpenAI()
|
||||||
|
|
||||||
|
|
||||||
|
def encode_image(image_path):
    """Read the image at *image_path* and return it base64-encoded as text.

    Retries forever while the file is locked (EACCES), since the capture
    process may still be writing the frame when we try to read it.
    """
    while True:
        try:
            with open(image_path, "rb") as image_file:
                raw = image_file.read()
        except IOError as err:
            # Anything other than "file in use" is a real failure.
            if err.errno != errno.EACCES:
                raise
            # Writer still holds the file; back off briefly and retry.
            time.sleep(0.1)
        else:
            return base64.b64encode(raw).decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def play_audio(text):
    """Synthesize *text* with ElevenLabs, archive the clip, then play it."""
    audio = generate(text=text, voice="ENfvYmv6CRqDodDZTieQ", model="eleven_turbo_v2")

    # Random, filesystem-safe directory name so every narration is kept.
    unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
    dir_path = os.path.join("narration", unique_id)
    os.makedirs(dir_path, exist_ok=True)

    with open(os.path.join(dir_path, "audio.wav"), "wb") as f:
        f.write(audio)

    play(audio)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_new_line(base64_image):
    """Build the next user turn: a describe-prompt plus the current frame.

    base64_image: JPEG frame encoded as base64 (no data-URI prefix).
    Returns a one-element message list ready to append to the chat history.
    """
    frame_part = {
        "type": "image_url",
        "image_url": f"data:image/jpeg;base64,{base64_image}",
    }
    text_part = {"type": "text", "text": "Describe this image"}
    return [{"role": "user", "content": [text_part, frame_part]}]
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_image(base64_image, script):
    """Ask GPT-4 Vision for an Attenborough-style narration of the frame.

    base64_image: JPEG frame encoded as base64 (no data-URI prefix).
    script: list of prior assistant messages; spliced between the system
        prompt and the new frame so the model avoids repeating itself.
    Returns the narration text.
    """
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
                """,
            },
        ]
        # Conversation so far, then the newest frame as a fresh user turn.
        + script
        + generate_new_line(base64_image),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Narrator loop: watch the latest webcam frame, have GPT-4 Vision narrate
    it as David Attenborough, speak the line aloud, then wait and repeat."""
    # Running assistant history so the narrator stays coherent between frames.
    script = []

    while True:
        # Path to the frame the capture process keeps overwriting.
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")

        # getting the base64 encoding (retries while the frame is mid-write)
        base64_image = encode_image(image_path)

        # narrate the image ("analyze posture" was a stale comment from judge.py)
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)

        print("🎙️ David says:")
        print(analysis)

        play_audio(analysis)

        # Remember what was said so the next narration doesn't repeat itself.
        script = script + [{"role": "assistant", "content": analysis}]

        # wait for 5 seconds
        time.sleep(5)


if __name__ == "__main__":
    main()
|
|
@@ -1,19 +1,39 @@
|
||||||
annotated-types==0.6.0
|
annotated-types==0.6.0
|
||||||
anyio==3.7.1
|
anyio==3.7.1
|
||||||
|
appnope==0.1.3
|
||||||
|
asttokens==2.4.1
|
||||||
certifi==2023.7.22
|
certifi==2023.7.22
|
||||||
charset-normalizer==3.3.2
|
charset-normalizer==3.3.2
|
||||||
|
decorator==5.1.1
|
||||||
distro==1.8.0
|
distro==1.8.0
|
||||||
|
elevenlabs==0.2.26
|
||||||
|
exceptiongroup==1.1.3
|
||||||
|
executing==2.0.1
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
httpcore==1.0.1
|
httpcore==1.0.1
|
||||||
httpx==0.25.1
|
httpx==0.25.1
|
||||||
idna==3.4
|
idna==3.4
|
||||||
|
ipython==8.17.2
|
||||||
|
jedi==0.19.1
|
||||||
|
matplotlib-inline==0.1.6
|
||||||
numpy==1.26.1
|
numpy==1.26.1
|
||||||
openai==1.1.1
|
openai==1.1.1
|
||||||
opencv-python==4.8.1.78
|
opencv-python==4.8.1.78
|
||||||
|
parso==0.8.3
|
||||||
|
pexpect==4.8.0
|
||||||
|
prompt-toolkit==3.0.41
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
pure-eval==0.2.2
|
||||||
pydantic==2.4.2
|
pydantic==2.4.2
|
||||||
pydantic_core==2.10.1
|
pydantic_core==2.10.1
|
||||||
|
Pygments==2.16.1
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
|
six==1.16.0
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
|
stack-data==0.6.3
|
||||||
tqdm==4.66.1
|
tqdm==4.66.1
|
||||||
|
traitlets==5.13.0
|
||||||
typing_extensions==4.8.0
|
typing_extensions==4.8.0
|
||||||
urllib3==2.0.7
|
urllib3==2.0.7
|
||||||
|
wcwidth==0.2.10
|
||||||
|
websockets==12.0
|
||||||
|
|
Loading…
Reference in New Issue