
Test Drive: Deepgram TTS


Today I test drove Deepgram’s TTS offering since they give free credits. I’m making a podcast out of the East Boston Oral History that I mentioned yesterday. The process is:

  1. Get an API key
  2. Invoke the API

The code is very simple and returns decent audio.

from deepgram import DeepgramClient, SpeakOptions

# Picks up the API key from the DEEPGRAM_API_KEY environment variable
deepgram = DeepgramClient()

def speak(text, filename):
    # Synthesize `text` with the Aura "Arcas" voice and save it as an MP3
    deepgram.speak.v("1").save(filename, {"text": text}, SpeakOptions(model="aura-arcas-en"))
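
With DEEPGRAM_API_KEY set, a call is one line (the text and filename here are just placeholders):

speak("Welcome to the East Boston Oral History Podcast.", "test.mp3")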

I suppose making the podcast is much more interesting. Here is the code for that, as well as a sample.

So the steps are:

  1. Transcribe
  2. Generate intro and outro text based on the transcription
  3. Generate speech
  4. Dub it all together
  5. Generate the feed

If I were to do this again, I would just use OpenAI's TTS instead.
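
For reference, a minimal sketch of that OpenAI alternative (the model and voice are assumptions, not something I tested):

from openai import OpenAI

client = OpenAI()

def speak_openai(text, filename):
    # "tts-1" and "alloy" are assumed defaults; swap in any supported model/voice
    response = client.audio.speech.create(model="tts-1", voice="alloy", input=text)
    response.write_to_file(filename)

Anyway, here is the full script: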

import glob
import mlx_whisper
import pydub
from pydantic import BaseModel
from openai import OpenAI
from pathlib import Path
from deepgram import DeepgramClient, SpeakOptions
from IPython.display import Audio, display

client = OpenAI()

class PodcastText(BaseModel):
    intro: str
    outro: str

def generate_text(text):
    # Structured outputs: the response is parsed straight into PodcastText
    return client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are the podcast host of \"East Boston Oral History Podcast\" who writes intros and outros based on the transcript."},
            {"role": "user", "content": text}
        ],
        response_format=PodcastText,
    ).choices[0].message.parsed

deepgram = DeepgramClient()

def speak(text, filename):
    # Same speak() as above, but return the saved path for pydub to pick up
    response = deepgram.speak.v("1").save(filename, {"text": text}, SpeakOptions(model="aura-arcas-en"))
    return response.filename

episodes = []
for f in glob.glob('/Users/ntaylor/Downloads/East Boston Oral History/*.mp3'):
    p = Path(f)
    # 1. Transcribe
    result = mlx_whisper.transcribe(f, path_or_hf_repo="mlx-community/whisper-large-v3-turbo")
    # 2. Generate intro and outro text based on the transcription
    text = generate_text(result['text'])
    # 3. Generate speech
    intro = speak(text.intro, 'intro.mp3')
    outro = speak(text.outro, 'outro.mp3')
    # 4. Dub it all together: 10s of theme music faded out, intro, a clip, outro
    audio = (
        pydub.AudioSegment.from_mp3("Greenway Groove.mp3")[:10000].fade_out(3000)
        .append(pydub.AudioSegment.from_mp3(intro))
        .append(pydub.AudioSegment.from_mp3(f)[:10000])
        .append(pydub.AudioSegment.from_mp3(outro))
    )
    audio.export(f'Episode {p.name}', format="mp3")
    display(Audio(f'Episode {p.name}'))
    episodes.append({
        'name': p.stem,
        'duration': int(audio.duration_seconds),
        'file': f'Episode {p.name}',  # the exported file, so the feed enclosure matches
    })


# see https://assets.ctfassets.net/jtdj514wr91r/3khl5YaRusSuQ4a18amk38/8f35aecf398979cdfa6839ae29e79a46/Podcast_Delivery_Specification_v1.9.pdf
from feedgen.feed import FeedGenerator
from datetime import datetime, timezone
dt = datetime.now(timezone.utc)

fg = FeedGenerator()
fg.load_extension('podcast')

fg.title('East Boston Oral History Podcast')
fg.link( href='http://example.com', rel='alternate' )
fg.description('foo')
fg.podcast.itunes_author('Nat Taylor')
fg.podcast.itunes_category('History')
fg.podcast.itunes_type('episodic')
fg.podcast.itunes_image('http://ex.com/logo.jpg')
fg.language('en')

for i, e in enumerate(episodes):
    fe = fg.add_entry()
    fe.guid(f"http://example.com/{e['file']}")  # guids must be unique per episode
    fe.title(e['name'])
    fe.description('foo')
    fe.pubDate(dt.isoformat())
    fe.podcast.itunes_order(i)
    fe.podcast.itunes_duration(e['duration'])
    # per the spec, enclosure length is the file size in bytes, not the duration
    fe.enclosure(url=f"http://example.com/{e['file']}", length=str(Path(e['file']).stat().st_size), type='audio/mpeg')

# fg.rss_file('podcast.xml')
print(fg.rss_str(pretty=True).decode('utf8'))
