Test Drive: Deepgram TTS
Published on .
Today I test drove Deepgram’s TTS offering since they give free credits. I’m making a podcast out of the East Boston Oral History that I mentioned yesterday. The process is:
- Get an API key
- Invoke
The code is very simple and returns decent audio.
from deepgram import DeepgramClient, SpeakOptions

# Picks up the API key from the DEEPGRAM_API_KEY environment variable
deepgram = DeepgramClient()

def speak(text, filename):
    response = deepgram.speak.v("1").save(filename, {"text": text}, SpeakOptions(model="aura-arcas-en"))
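With DEEPGRAM_API_KEY set in the environment, a quick smoke test looks something like this (the text and filename are just placeholders):

speak("Welcome to the East Boston Oral History Podcast.", "test.mp3")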
I suppose making the podcast is much more interesting. Here is the code for that, as well as a sample.
So the steps are:
- Transcribe (a minimal call is sketched after this list)
- Generate intro and outro text based on the transcription
- Generate speech
- Dub it all together
- Generate the feed
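The transcription step runs Whisper locally via mlx-whisper. Stripped down to a single file, it looks roughly like this (the filename is a placeholder):

import mlx_whisper

result = mlx_whisper.transcribe("interview.mp3", path_or_hf_repo="mlx-community/whisper-large-v3-turbo")
print(result["text"])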
If I were to do this again, I would just use the OpenAI TTS.
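For reference, a drop-in replacement for speak() built on OpenAI's TTS endpoint would look roughly like this (the model and voice are arbitrary picks, not something I tested here):

from openai import OpenAI

openai_client = OpenAI()

def speak_openai(text, filename):
    # Sketch only: "tts-1" and "alloy" are assumed choices, not tested here
    with openai_client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        input=text,
    ) as response:
        response.stream_to_file(filename)
    return filename

Anyway, here is the full script: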
import glob
from pathlib import Path

import mlx_whisper
import pydub
from IPython.display import Audio, display
from pydantic import BaseModel
from openai import OpenAI
from deepgram import DeepgramClient, SpeakOptions

client = OpenAI()

class PodcastText(BaseModel):
    intro: str
    outro: str

def generate_text(text):
    # Ask gpt-4o-mini for a structured intro/outro based on the episode transcript
    return client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are the podcast host of \"East Boston Oral History Podcast\" who writes intros and outros based on the transcript."},
            {"role": "user", "content": text}
        ],
        response_format=PodcastText,
    ).choices[0].message.parsed

deepgram = DeepgramClient()

def speak(text, filename):
    response = deepgram.speak.v("1").save(filename, {"text": text}, SpeakOptions(model="aura-arcas-en"))
    return response.filename

episodes = []
for f in glob.glob('/Users/ntaylor/Downloads/East Boston Oral History/*.mp3'):
    p = Path(f)
    # Transcribe locally, then generate intro/outro text from the transcript
    result = mlx_whisper.transcribe(f, path_or_hf_repo="mlx-community/whisper-large-v3-turbo")
    text = generate_text(result['text'])
    intro = speak(text.intro, 'intro.mp3')
    outro = speak(text.outro, 'outro.mp3')
    # Dub it all together: 10s of theme music with a fade, the intro,
    # a 10-second sample of the episode, and the outro
    audio = (
        pydub.AudioSegment.from_mp3("Greenway Groove.mp3")[:10000].fade_out(3000)
        .append(pydub.AudioSegment.from_mp3(intro))
        .append(pydub.AudioSegment.from_mp3(f)[:10000])
        .append(pydub.AudioSegment.from_mp3(outro))
    )
    audio.export(f'Episode {p.parts[-1]}', format="mp3")
    display(Audio(f'Episode {p.parts[-1]}'))
    episodes.append({
        'name': p.stem,
        'duration': int(audio.duration_seconds),
        'file': p.name,
    })
# see https://assets.ctfassets.net/jtdj514wr91r/3khl5YaRusSuQ4a18amk38/8f35aecf398979cdfa6839ae29e79a46/Podcast_Delivery_Specification_v1.9.pdf
from feedgen.feed import FeedGenerator
from datetime import datetime, timezone
dt = datetime.now()
dt = dt.replace(tzinfo=timezone.utc)
fg = FeedGenerator()
fg.load_extension('podcast')
fg.title('East Boston Oral History Podcast')
fg.link( href='http://example.com', rel='alternate' )
fg.description('foo')
fg.podcast.itunes_author('Nat Taylor')
fg.podcast.itunes_category('History')
fg.podcast.itunes_type('episodic')
fg.podcast.itunes_image('http://ex.com/logo.jpg')
fg.language('en')
for i, e in enumerate(episodes):
    fe = fg.add_entry()
    fe.guid(f"http://example.com/{e['file']}")  # placeholder, but must be unique per episode
    fe.title(e['name'])
    fe.description('foo')
    fe.pubDate(dt.isoformat())
    fe.podcast.itunes_order(i)
    fe.podcast.itunes_duration(e['duration'])
    # length is supposed to be the file size in bytes; duration is a stand-in here
    fe.enclosure(url=f"http://example.com/{e['file']}", length=e['duration'], type='audio/mpeg')
# fg.rss_file('podcast.xml')
print(fg.rss_str(pretty=True).decode('utf8'))