A Coding Implementation on Deepgram Python SDK for Transcription, Text-to-Speech, Async Audio Processing, and Text Intelligence
In this tutorial, we build an advanced hands-on workflow with the Deepgram Python SDK and explore how modern voice AI capabilities come together in a single Python environment. We set up authentication, connect both synchronous and asynchronous Deepgram clients, and work directly with real audio data to understand how the SDK handles transcription, speech generation, and text analysis in practice. We transcribe audio from both a URL and a local file, inspect confidence scores, word-level timestamps, speaker diarization, paragraph formatting, and AI-generated summaries, and then extend the pipeline to async processing for faster, more scalable execution. We also generate speech with multiple TTS voices, analyze text for sentiment, topics, and intents, and examine advanced transcription controls such as keyword search, replacement, boosting, raw response access, and structured error handling. Through this process, we create a practical, end-to-end Deepgram voice AI workflow that is both technically detailed and easy to adapt for real-world applications. 
Copy CodeCopiedUse a different Browser !pip install deepgram-sdk httpx –quiet import os, asyncio, textwrap, urllib.request from getpass import getpass from deepgram import DeepgramClient, AsyncDeepgramClient from deepgram.core.api_error import ApiError from IPython.display import Audio, display DEEPGRAM_API_KEY = getpass(” Enter your Deepgram API key: “) os.environ[“DEEPGRAM_API_KEY”] = DEEPGRAM_API_KEY client = DeepgramClient(api_key=DEEPGRAM_API_KEY) async_client = AsyncDeepgramClient(api_key=DEEPGRAM_API_KEY) AUDIO_URL = “https://dpgr.am/spacewalk.wav” AUDIO_PATH = “/tmp/sample.wav” urllib.request.urlretrieve(AUDIO_URL, AUDIO_PATH) def read_audio(path=AUDIO_PATH): with open(path, “rb”) as f: return f.read() def _get(obj, key, default=None): “””Get a field from either a dict or an object — v6 returns both.””” if isinstance(obj, dict): return obj.get(key, default) return getattr(obj, key, default) def get_model_name(meta): mi = _get(meta, “model_info”) if mi is None: return “n/a” return _get(mi, “name”, “n/a”) def tts_to_bytes(response) -> bytes: “””v6 generate() returns a generator of chunks or an object with .stream.””” if hasattr(response, “stream”): return response.stream.getvalue() return b””.join(chunk for chunk in response if isinstance(chunk, bytes)) def save_tts(response, path: str) -> str: with open(path, “wb”) as f: f.write(tts_to_bytes(response)) return path print(” Deepgram client ready | sample audio downloaded”) print(“n” + “=”*60) print(” SECTION 2: Pre-Recorded Transcription from URL”) print(“=”*60) response = client.listen.v1.media.transcribe_url( url=AUDIO_URL, model=”nova-3″, smart_format=True, diarize=True, language=”en”, utterances=True, filler_words=True, ) transcript = response.results.channels[0].alternatives[0].transcript print(f”n Full Transcript:n{textwrap.fill(transcript, 80)}”) confidence = response.results.channels[0].alternatives[0].confidence print(f”n Confidence: {confidence:.2%}”) words = 
response.results.channels[0].alternatives[0].words print(f”n First 5 words with timing:”) for w in words[:5]: print(f” ‘{w.word}’ start={w.start:.2f}s end={w.end:.2f}s conf={w.confidence:.2f}”) print(f”n Speaker Diarization (first 5 words):”) for w in words[:5]: speaker = getattr(w, “speaker”, None) if speaker is not None: print(f” Speaker {int(speaker)}: ‘{w.word}'”) meta = response.metadata print(f”n Metadata: duration={meta.duration:.2f}s channels={int(meta.channels)} model={get_model_name(meta)}”) We install the Deepgram SDK and its dependencies, then securely set up authentication using our API key. We initialize both synchronous and asynchronous Deepgram clients, download a sample audio file, and define helper functions to make it easier to work with mixed response objects, audio bytes, model metadata, and streamed TTS outputs. We then run our first pre-recorded transcription from a URL and inspect the transcript, confidence score, word-level timestamps, speaker diarization, and metadata to understand the structure and richness of the response. 
Copy CodeCopiedUse a different Browser print(“n” + “=”*60) print(” SECTION 3: Pre-Recorded Transcription from File”) print(“=”*60) file_response = client.listen.v1.media.transcribe_file( request=read_audio(), model=”nova-3″, smart_format=True, diarize=True, paragraphs=True, summarize=”v2”, ) alt = file_response.results.channels[0].alternatives[0] paragraphs = getattr(alt, “paragraphs”, None) if paragraphs and _get(paragraphs, “paragraphs”): print(“n Paragraph-Formatted Transcript:”) for para in _get(paragraphs, “paragraphs”)[:2]: sentences = ” “.join(_get(s, “text”, “”) for s in (_get(para, “sentences”) or [])) print(f” [Speaker {int(_get(para,’speaker’,0))}, ” f”{_get(para,’start’,0):.1f}s–{_get(para,’end’,0):.1f}s] {sentences[:120]}…”) else: print(f”n Transcript: {alt.transcript[:200]}…”) if getattr(file_response.results, “summary”, None): short = _get(file_response.results.summary, “short”, “”) if short: print(f”n AI Summary: {short}”) print(f”n Confidence: {alt.confidence:.2%}”) print(f” Word count : {len(alt.words)}”) print(“n” + “=”*60) print(” SECTION 4: Async Parallel Transcription”) print(“=”*60) async def transcribe_async(): audio_bytes = read_audio() async def from_url(label): r = await async_client.listen.v1.media.transcribe_url( url=AUDIO_URL, model=”nova-3″, smart_format=True, ) print(f” [{label}] {r.results.channels[0].alternatives[0].transcript[:100]}…”) async def from_file(label): r = await async_client.listen.v1.media.transcribe_file( request=audio_bytes, model=”nova-3″, smart_format=True, ) print(f” [{label}] {r.results.channels[0].alternatives[0].transcript[:100]}…”) await asyncio.gather(from_url(“From URL”), from_file(“From File”)) await transcribe_async() We move from URL-based to file-based transcription by sending raw audio bytes directly to the Deepgram API, enabling richer options such as paragraphs and summarization. 
# We inspect the returned paragraph structure, speaker segmentation, summary
# output, confidence score, and word count to see how the SDK supports more
# readable and analysis-friendly transcription results. We also introduce
# asynchronous processing and run URL-based and file-based transcription in
# parallel, helping us understand how to build faster, more scalable voice AI
# pipelines.

print("\n" + "="*60)
print(" SECTION 5: Text-to-Speech")
print("="*60)

sample_text = (
    "Welcome to the Deepgram advanced tutorial. "
    "This SDK lets you transcribe audio, generate speech, "
    "and analyse text — all with a simple Python interface."
)

# Generate speech and write it to disk so it can be sized and played back.
tts_path = save_tts(
    client.speak.v1.audio.generate(text=sample_text, model="aura-2-asteria-en"),
    "/tmp/tts_output.mp3",
)
size_kb = os.path.getsize(tts_path) / 1024
print(f" TTS audio saved → {tts_path} ({size_kb:.1f} KB)")
display(Audio(tts_path))

print("\n" + "="*60)
print(" SECTION 6: Multiple TTS Voices Comparison")
print("="*60)

voices = {
    "aura-2-asteria-en": "Asteria (female, warm)",
    "aura-2-orion-en": "Orion (male, deep)",
    "aura-2-luna-en": "Luna (female, bright)",
}

for model_id, label in voices.items():
    # Best-effort demo loop: a failure for one voice is reported and the loop continues.
    try:
        path = save_tts(
            client.speak.v1.audio.generate(text="Hello! I am a Deepgram voice model.", model=model_id),
            f"/tmp/tts_{model_id}.mp3",
        )
        print(f" {label}")
        display(Audio(path))
    except Exception as e:
        print(f" {label} — {e}")

print("\n" + "="*60)
print(" SECTION 7: Text Intelligence — Sentiment, Topics, Intents")
print("="*60)

review_text = (
    "I absolutely love this product! It arrived quickly, the quality is "
    "outstanding, and customer support was incredibly helpful when I had "
    "a question. I would definitely recommend it to anyone looking for "
    "a reliable solution. Five stars!"
)

read_response = client.read.v1.text.analyze(
    request={"text": review_text},
    language="en",
    sentiment=True,
    topics=True,
    intents=True,
    summarize=True,
)
results = read_response.results

# We focus on speech generation by converting text to audio using Deepgram’s
# text-to-speech API and saving the resulting audio as an MP3 file. We then
# compare multiple TTS voices to hear how different voice models behave and how
# easily we can switch between them while keeping the same code pattern. After
# that, we begin working with the Read API by passing the review text into
# Deepgram’s text intelligence system to analyze language beyond simple
# transcription.

if getattr(results, "sentiments", None):
    overall = results.sentiments.average
    print(f" Sentiment: {_get(overall,'sentiment','?').upper()} "
          f"(score={_get(overall,'sentiment_score',0):.3f})")
    # NOTE(review): the source listing is cut off inside the next statement; the
    # fragment is preserved verbatim below, not reconstructed.
    # for seg in (_get(results.sentiments, "segments") or [])[:2]:
    #     print(f" •



