From b1f9236bb6ea0a89193477b3bec93f3efb095dec Mon Sep 17 00:00:00 2001 From: Ray Smets Date: Thu, 23 Nov 2023 01:21:43 -0800 Subject: [PATCH 1/4] [Setup] script. (#1) --- README.md | 7 +++++++ setup.sh | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100755 setup.sh diff --git a/README.md b/README.md index c10bdcb..1c9b150 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,13 @@ Make a new voice in Eleven and get the voice id of that voice using their [get v export ELEVENLABS_VOICE_ID= ``` +### Setup Script + +Alternatively, one can use the `setup.sh` script to facilitate getting the shell envs ready to rock by updating the API key values in `setup.sh` and run. + +_Note: may have to manually run `source source venv/bin/activate` afterwards depending on shell env._ + + ## Run it! In on terminal, run the webcam capture: diff --git a/setup.sh b/setup.sh new file mode 100755 index 0000000..bab016d --- /dev/null +++ b/setup.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# create a virtual environment +python3 -m pip install virtualenv +python3 -m virtualenv venv + +# source the virtual environment +source venv/bin/activate + +# install the dependencies +pip install -r requirements.txt + +# set the environment variables +export ELEVENLABS_VOICE_ID= +export OPENAI_API_KEY= +export ELEVENLABS_API_KEY= \ No newline at end of file From 4ab05a4b1d13dab4e047e000e78d9c897d02467d Mon Sep 17 00:00:00 2001 From: Ray Smets Date: Thu, 23 Nov 2023 01:22:52 -0800 Subject: [PATCH 2/4] [Narrator] prompt to describe the image like David Attenborough for increased complex descriptors. 
(#2) --- narrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narrator.py b/narrator.py index cd086f7..845158f 100644 --- a/narrator.py +++ b/narrator.py @@ -43,7 +43,7 @@ def generate_new_line(base64_image): { "role": "user", "content": [ - {"type": "text", "text": "Describe this image"}, + {"type": "text", "text": "Describe this image as if you David Attenborough"}, { "type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}", From 1bb728ada311c0892ac18f61718e6538279a3192 Mon Sep 17 00:00:00 2001 From: Ray Smets Date: Thu, 23 Nov 2023 01:45:28 -0800 Subject: [PATCH 3/4] [Narrator] fix --- narrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narrator.py b/narrator.py index 845158f..7eca4f5 100644 --- a/narrator.py +++ b/narrator.py @@ -43,7 +43,7 @@ def generate_new_line(base64_image): { "role": "user", "content": [ - {"type": "text", "text": "Describe this image as if you David Attenborough"}, + {"type": "text", "text": "Describe this image as if you are David Attenborough"}, { "type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}", From 94684fca3406adb745a29f89fd3d787cddc78852 Mon Sep 17 00:00:00 2001 From: Ray Smets Date: Thu, 23 Nov 2023 14:45:43 -0800 Subject: [PATCH 4/4] [Narrator] streaming --- README.md | 23 ++++++++++++++++------- narrator.py | 36 ++++++++++++++++++++++++++++-------- setup.sh | 4 +++- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 1c9b150..b2c7f25 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ -# David Attenborough narrates your life. +# David Attenborough narrates your life. https://twitter.com/charliebholtz/status/1724815159590293764 ## Want to make your own AI app? + Check out [Replicate](https://replicate.com). We make it easy to run machine learning models with an API. 
## Setup @@ -20,33 +21,41 @@ Then, install the dependencies: Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), and [ElevenLabs](https://elevenlabs.io) account and set your tokens: -``` +```bash export OPENAI_API_KEY= export ELEVENLABS_API_KEY= ``` Make a new voice in Eleven and get the voice id of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab. -``` +```bash export ELEVENLABS_VOICE_ID= ``` -### Setup Script +### Streaming -Alternatively, one can use the `setup.sh` script to facilitate getting the shell envs ready to rock by updating the API key values in `setup.sh` and run. +If you would like the speech to start sooner, set the following environment variable to enable streaming. The trade-off is that the audio snippet is not saved in the `/narration` directory. + +```bash +export ELEVENLABS_STREAMING=true +``` + +### Script + +Alternative to running the commands above individually, one can use the `setup.sh` script to facilitate getting the two required shell envs ready to rock by updating the environment variable values in `setup.sh` and executing the script. _Note: may have to manually run `source source venv/bin/activate` afterwards depending on shell env._ - ## Run it! 
In on terminal, run the webcam capture: + ```bash python capture.py ``` + In another terminal, run the narrator: ```bash python narrator.py ``` - diff --git a/narrator.py b/narrator.py index 7eca4f5..d33da74 100644 --- a/narrator.py +++ b/narrator.py @@ -1,16 +1,24 @@ -import os -from openai import OpenAI import base64 -import json -import time -import simpleaudio as sa import errno -from elevenlabs import generate, play, set_api_key, voices +import json +import os +import time + +import simpleaudio as sa +from elevenlabs import generate, play, set_api_key, stream, voices +from openai import OpenAI client = OpenAI() set_api_key(os.environ.get("ELEVENLABS_API_KEY")) + +# This code initializes the variable 'isStreaming' based on the value of the environment variable 'ELEVENLABS_STREAMING'. +# If the value of 'ELEVENLABS_STREAMING' is "true", then 'isStreaming' is set to True. +# Otherwise, 'isStreaming' is set to False. +isStreaming = os.environ.get("ELEVENLABS_STREAMING", "false") == "true" + + def encode_image(image_path): while True: try: @@ -25,7 +33,16 @@ def encode_image(image_path): def play_audio(text): - audio = generate(text, voice=os.environ.get("ELEVENLABS_VOICE_ID")) + audio = generate( + text, + voice=os.environ.get("ELEVENLABS_VOICE_ID"), + model="eleven_turbo_v2", + stream=isStreaming, + ) + + if isStreaming: + stream(audio) + return unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=") dir_path = os.path.join("narration", unique_id) @@ -43,7 +60,10 @@ def generate_new_line(base64_image): { "role": "user", "content": [ - {"type": "text", "text": "Describe this image as if you are David Attenborough"}, + { + "type": "text", + "text": "Describe this image as if you are David Attenborough", + }, { "type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}", diff --git a/setup.sh b/setup.sh index bab016d..823a544 100755 --- a/setup.sh +++ b/setup.sh @@ -13,4 +13,6 @@ pip install -r requirements.txt # set the 
environment variables export ELEVENLABS_VOICE_ID= export OPENAI_API_KEY= -export ELEVENLABS_API_KEY= \ No newline at end of file +export ELEVENLABS_API_KEY= + +export ELEVENLABS_STREAMING=false