Skip to main content

Documentation Index

Fetch the complete documentation index at: https://assemblyai.com/docs/llms.txt

Use this file to discover all available pages before exploring further.


Streaming Diarization

Streaming Diarization lets you identify and label individual speakers in real time directly from the Streaming API. Each Turn event includes a speaker_label field (e.g. A, B) indicating the dominant speaker for that turn. Each final word in the words array also carries a speaker field, enabling mid-turn speaker change detection. Speaker accuracy improves over the course of a session as the model accumulates embedding context — so the longer the conversation, the better the labels.
Already using AssemblyAI streaming? You can enable Streaming Diarization by adding speaker_labels: true to your connection parameters. No other changes are required — the speaker_label field will appear on every Turn event, and each final word in the words array will include a speaker field automatically.

Quickstart

Get started with Streaming Diarization using the code below. This example streams audio from your microphone and prints each turn with its speaker label.
1
Install the required libraries
pip install websocket-client pyaudio
2
Create a new file main.py and paste the code below. Replace <YOUR_API_KEY> with your API key.
3
Run with python main.py and speak into your microphone.
import pyaudio
import websocket
import json
import threading
import time
from urllib.parse import urlencode

# --- Credentials and streaming endpoint -------------------------------------
YOUR_API_KEY = "<YOUR_API_KEY>"
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
CONNECTION_PARAMS = dict(
    sample_rate=16000,
    speech_model="u3-rt-pro",
    # Passed as the string "true": urlencode() would render a Python True
    # as "True" in the query string.
    speaker_labels="true",
)
API_ENDPOINT = API_ENDPOINT_BASE_URL + "?" + urlencode(CONNECTION_PARAMS)

# --- Microphone capture settings --------------------------------------------
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]  # keep capture rate == declared rate
FRAMES_PER_BUFFER = 800  # 800 frames at 16 kHz = 50 ms of audio per read
CHANNELS = 1
FORMAT = pyaudio.paInt16

# --- Runtime state shared between the callbacks and run() -------------------
audio = stream = ws_app = audio_thread = None
stop_event = threading.Event()  # set to ask the audio sender thread to stop

def on_open(ws):
    """WebSocket "open" callback: start forwarding microphone audio.

    Spawns a daemon thread that repeatedly reads FRAMES_PER_BUFFER frames
    from the module-level input ``stream`` and sends them to ``ws`` as binary
    messages until ``stop_event`` is set or a read/send fails.

    Args:
        ws: The connected WebSocket object the audio is sent through.
    """
    global audio_thread
    print("WebSocket connection opened.")

    def stream_audio():
        # Any read/send failure ends the loop; the error is reported once.
        try:
            while not stop_event.is_set():
                chunk = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
        except Exception as e:
            print(f"Error streaming audio: {e}")

    audio_thread = threading.Thread(target=stream_audio, daemon=True)
    audio_thread.start()

def _format_turn_segments(words, turn_speaker):
    """Group consecutive words by speaker into "[SPEAKER] text" segments.

    Args:
        words: Iterable of word dicts from a Turn event. Each may carry a
            "speaker" key; when it is missing (or None) the word is
            attributed to ``turn_speaker``.
        turn_speaker: Turn-level label used as the fallback speaker.

    Returns:
        A list of strings, one per run of consecutive same-speaker words.
        Words without any text are skipped.
    """
    segments = []
    current_speaker, current_words = None, []
    for word in words:
        text = word.get("text")
        if not text:
            # A word without text would otherwise abort the whole turn.
            continue
        spk = word.get("speaker")
        if spk is None:
            spk = turn_speaker
        if current_words and spk == current_speaker:
            current_words.append(text)
            continue
        if current_words:
            segments.append(f"[{current_speaker}] {' '.join(current_words)}")
        current_speaker, current_words = spk, [text]
    if current_words:
        segments.append(f"[{current_speaker}] {' '.join(current_words)}")
    return segments


def on_message(ws, message):
    """WebSocket "message" callback: print session events and transcripts.

    Handles three message types:
      * "Begin": prints the session id.
      * "Turn": while the turn is in progress, rewrites the current console
        line with "[speaker_label] transcript"; at end of turn, prints one
        line split into per-speaker segments built from the word-level
        ``speaker`` fields (falling back to the turn-level label, and to the
        plain transcript when no usable words are present).
      * "Termination": prints the reported audio duration.

    Args:
        ws: The WebSocket object (unused).
        message: The raw JSON text received from the server.
    """
    # Blank the line before every redraw: an in-progress transcript can get
    # shorter (for example when the label changes from UNKNOWN to A), which
    # would otherwise leave the tail of the previous text on screen.
    clear_line = f"\r{' ' * 80}\r"
    try:
        data = json.loads(message)
        msg_type = data.get("type")
        if msg_type == "Begin":
            print(f"Session began: ID={data.get('id')}")
        elif msg_type == "Turn":
            turn_speaker = data.get("speaker_label") or "UNKNOWN"
            transcript = data.get("transcript", "")
            if data.get("end_of_turn", False):
                # `or []` also covers an explicit null "words" value.
                segments = _format_turn_segments(data.get("words") or [], turn_speaker)
                output = " ".join(segments) or f"[{turn_speaker}] {transcript}"
                print(f"{clear_line}{output}", flush=True)
            else:
                # No trailing newline here, so flush explicitly to keep the
                # live line visible even when stdout is not line-buffered.
                print(f"{clear_line}[{turn_speaker}] {transcript}", end="", flush=True)
        elif msg_type == "Termination":
            print(f"\nSession terminated: {data.get('audio_duration_seconds', 0)}s of audio")
    except Exception as e:
        # Callback boundary: report and keep the session alive.
        print(f"Error handling message: {e}")

def on_error(ws, error):
    """WebSocket "error" callback: report the error and stop audio capture.

    Args:
        ws: The WebSocket object (unused).
        error: The error object passed in by the WebSocket client.
    """
    report = f"\nWebSocket Error: {error}"
    print(report)
    # Tell the audio sender loop to exit.
    stop_event.set()

def on_close(ws, close_status_code, close_msg):
    """WebSocket "close" callback: stop capture and release audio resources.

    Signals the sender thread to stop, waits briefly for it to leave its
    current read, then stops/closes the input stream and terminates PyAudio.
    The module-level ``stream`` and ``audio`` handles are reset to None so a
    repeated call does not close them twice.

    Args:
        ws: The WebSocket object (unused).
        close_status_code: Close status code reported by the client library.
        close_msg: Close reason reported by the client library (unused).
    """
    global stream, audio
    print(f"\nWebSocket Disconnected: Status={close_status_code}")
    stop_event.set()

    # Let the sender thread finish its in-flight stream.read() before the
    # stream is closed underneath it. A read covers FRAMES_PER_BUFFER frames,
    # so a short bounded wait is enough; never join the current thread.
    sender = audio_thread
    if (
        sender is not None
        and sender is not threading.current_thread()
        and sender.is_alive()
    ):
        sender.join(timeout=1.0)

    try:
        if stream:
            try:
                if stream.is_active():
                    stream.stop_stream()
            finally:
                stream.close()
    finally:
        # Always drop the handles and terminate PyAudio, even if stopping or
        # closing the stream raised.
        stream = None
        if audio:
            try:
                audio.terminate()
            finally:
                audio = None

def run():
    """Capture microphone audio and stream it for live diarized transcription.

    Opens a PyAudio input stream, connects a WebSocketApp to API_ENDPOINT on
    a daemon thread, and then idles until that thread ends. On Ctrl+C it asks
    the server to end the session with a "Terminate" message, waits up to two
    seconds for the connection to wind down, and closes the socket.

    Raises:
        Exception: Whatever ``audio.open`` raises when the input stream
            cannot be opened; PyAudio is terminated before re-raising.
    """
    global audio, stream, ws_app
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
    except Exception:
        # Opening the input stream failed: release PyAudio instead of
        # leaking it, then let the caller see the original error.
        audio.terminate()
        audio = None
        raise

    print("Speak into your microphone. Press Ctrl+C to stop.")
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )
    ws_thread = threading.Thread(target=ws_app.run_forever, daemon=True)
    ws_thread.start()
    try:
        # Poll instead of a bare join() so Ctrl+C is handled promptly.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nStopping...")
        stop_event.set()
        if ws_app.sock and ws_app.sock.connected:
            try:
                ws_app.send(json.dumps({"type": "Terminate"}))
            except Exception as e:
                # The socket can close between the check and the send; still
                # fall through to close()/join() below.
                print(f"Error sending Terminate: {e}")
            else:
                # Wait up to 2 s for the session to end, returning as soon as
                # the WebSocket thread finishes rather than always sleeping.
                ws_thread.join(timeout=2.0)
        ws_app.close()
        ws_thread.join(timeout=2.0)


if __name__ == "__main__":
    run()

Configuration

Enable Streaming Diarization by adding speaker_labels: true to your connection parameters. You can optionally cap the number of speakers with max_speakers.
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| speaker_labels | boolean | false | Set to true to enable real-time speaker diarization. |
| max_speakers | integer | — | Optional. Hint the maximum number of speakers expected (1–10). Setting this accurately can improve assignment accuracy when you know the speaker count in advance. |
{
  "speech_model": "u3-rt-pro",
  "speaker_labels": true,
  "max_speakers": 2
}
Diarization is supported on all streaming models: u3-rt-pro, universal-streaming-english, universal-streaming-multilingual, and whisper-rt. You do not need to change your speech model to use it — just add speaker_labels: true.

Reading speaker labels

When diarization is enabled, every Turn event includes a speaker_label field reflecting the dominant speaker for that turn.
{
  "type": "Turn",
  "transcript": "Good morning, thanks for joining the call.",
  "speaker_label": "A",
  "end_of_turn": true,
  "turn_is_formatted": true
}

Word-level speaker labels

Each final word in the words array also carries a speaker field. This allows you to detect speaker changes within a single turn — for example, a turn where one speaker finishes another’s sentence, or where a brief interjection appears mid-turn.
{
  "type": "Turn",
  "transcript": "Yeah. Different things the way they are said here than in contrast to the way they'd be said in other countries. Yeah. Your Colombian Spanish won't work. Uh, no. And she said she could— when I—",
  "speaker_label": "A",
  "end_of_turn": true,
  "words": [
    { "text": "Yeah.",      "speaker": "UNKNOWN", "word_is_final": true, "start":    0, "end":   96, "confidence": 0.204160 },
    { "text": "Different",  "speaker": "A",       "word_is_final": true, "start":  145, "end":  290, "confidence": 0.844642 },
    { "text": "things",     "speaker": "A",       "word_is_final": true, "start":  306, "end":  516, "confidence": 0.998971 },
    { "text": "the",        "speaker": "A",       "word_is_final": true, "start":  581, "end":  613, "confidence": 0.807043 },
    { "text": "way",        "speaker": "A",       "word_is_final": true, "start":  662, "end":  694, "confidence": 0.999722 },
    { "text": "they",       "speaker": "A",       "word_is_final": true, "start":  840, "end":  920, "confidence": 0.995030 },
    { "text": "are",        "speaker": "A",       "word_is_final": true, "start":  985, "end": 1017, "confidence": 0.867126 },
    { "text": "said",       "speaker": "A",       "word_is_final": true, "start": 1082, "end": 1260, "confidence": 0.990436 },
    { "text": "here",       "speaker": "A",       "word_is_final": true, "start": 1308, "end": 1502, "confidence": 0.999566 },
    { "text": "than",       "speaker": "A",       "word_is_final": true, "start": 1873, "end": 2003, "confidence": 0.618784 },
    { "text": "in",         "speaker": "A",       "word_is_final": true, "start": 2116, "end": 2229, "confidence": 0.955239 },
    { "text": "contrast",   "speaker": "A",       "word_is_final": true, "start": 2277, "end": 2568, "confidence": 0.998716 },
    { "text": "to",         "speaker": "A",       "word_is_final": true, "start": 2617, "end": 2714, "confidence": 0.992622 },
    { "text": "the",        "speaker": "A",       "word_is_final": true, "start": 2778, "end": 2810, "confidence": 0.996170 },
    { "text": "way",        "speaker": "A",       "word_is_final": true, "start": 2859, "end": 2956, "confidence": 0.999566 },
    { "text": "they'd",     "speaker": "A",       "word_is_final": true, "start": 3020, "end": 3214, "confidence": 0.844162 },
    { "text": "be",         "speaker": "A",       "word_is_final": true, "start": 3263, "end": 3295, "confidence": 0.998969 },
    { "text": "said",       "speaker": "A",       "word_is_final": true, "start": 3424, "end": 3602, "confidence": 0.994370 },
    { "text": "in",         "speaker": "A",       "word_is_final": true, "start": 3650, "end": 3683, "confidence": 0.999225 },
    { "text": "other",      "speaker": "A",       "word_is_final": true, "start": 3747, "end": 3861, "confidence": 0.999323 },
    { "text": "countries.", "speaker": "A",       "word_is_final": true, "start": 3974, "end": 4281, "confidence": 0.868172 },
    { "text": "Yeah.",      "speaker": "UNKNOWN", "word_is_final": true, "start": 4458, "end": 4636, "confidence": 0.656062 },
    { "text": "Your",       "speaker": "B",       "word_is_final": true, "start": 5040, "end": 5088, "confidence": 0.856220 },
    { "text": "Colombian",  "speaker": "B",       "word_is_final": true, "start": 5121, "end": 5638, "confidence": 0.962598 },
    { "text": "Spanish",    "speaker": "B",       "word_is_final": true, "start": 5638, "end": 6090, "confidence": 0.999557 },
    { "text": "won't",      "speaker": "B",       "word_is_final": true, "start": 6154, "end": 6284, "confidence": 0.999431 },
    { "text": "work.",      "speaker": "B",       "word_is_final": true, "start": 6332, "end": 6445, "confidence": 0.589761 },
    { "text": "Uh,",        "speaker": "A",       "word_is_final": true, "start": 6736, "end": 6752, "confidence": 0.343677 },
    { "text": "no.",        "speaker": "A",       "word_is_final": true, "start": 7673, "end": 7867, "confidence": 0.728975 },
    { "text": "And",        "speaker": "A",       "word_is_final": true, "start": 8820, "end": 8869, "confidence": 0.891464 },
    { "text": "she",        "speaker": "A",       "word_is_final": true, "start": 8901, "end": 9014, "confidence": 0.992945 },
    { "text": "said",       "speaker": "A",       "word_is_final": true, "start": 9079, "end": 9240, "confidence": 0.999052 },
    { "text": "she",        "speaker": "A",       "word_is_final": true, "start": 9305, "end": 9402, "confidence": 0.902266 },
    { "text": "could—",     "speaker": "A",       "word_is_final": true, "start": 9466, "end": 9579, "confidence": 0.605757 },
    { "text": "when",       "speaker": "A",       "word_is_final": true, "start": 9644, "end": 9757, "confidence": 0.827706 },
    { "text": "I—",         "speaker": "A",       "word_is_final": true, "start": 9870, "end": 9886, "confidence": 0.437470 }
  ]
}
A few things to keep in mind when consuming speaker:
  • Final words only. The speaker field only appears on words where word_is_final: true. Non-final (in-progress) words never carry it.
  • speaker can be absent on individual words. If the field is missing from a word entirely, treat that word as unattributed and fall back to the turn-level speaker_label if you need a label. Absent means the field is omitted from the JSON — it will never be null.
  • UNKNOWN at word level means the model couldn’t confidently attribute that word to any specific speaker — common for short backchannels (“uh huh”, “yeah”) or brief low-quality audio segments. It is not an ambiguity flag between two known speakers; words in a confidently-attributed stretch carry the speaker’s letter, not UNKNOWN.
If a turn contains less than approximately 1 second of audio, the turn-level speaker_label will be set to "UNKNOWN". This is because the model needs at least ~1 second of audio to generate a reliable diarization embedding — without enough audio, embeddings may be inaccurate and could lead to a single speaker being labeled as multiple speakers. Labeling short turns as "UNKNOWN" ensures that speaker labels remain as accurate as possible.
{
  "type": "Turn",
  "transcript": "Hello?",
  "speaker_label": "UNKNOWN",
  "end_of_turn": true,
  "turn_is_formatted": true
}
Your application should handle this case gracefully. A typical multi-speaker exchange looks like this:
[A] Good morning, thanks for joining the call.
[B] Good morning. Happy to be here.
[A] So let's start with a quick overview of the project timeline.
[B] Sure. We're currently on track for the March deadline.
[A] Great. And how's the team handling the workload?
[C] It's been busy, but manageable. We brought on two new engineers last week.

How speaker accuracy improves over time

Streaming Diarization builds a speaker profile incrementally as audio flows in. In practice this means:
  • Early in a session, speaker assignments may be less stable, especially if the first few turns are short.
  • As the session progresses, the model accumulates richer speaker embeddings and assignments become more consistent.
For long-form use cases (call center, clinical scribe, meeting transcription), the model will settle into accurate, stable labels well before the end of the conversation.

Known limitations

Real-time diarization is an inherently harder problem than diarization for async transcription on pre-recorded audio. The following limitations apply to the current beta:
  • Short utterances — Turns with less than ~1 second of audio are labeled as "UNKNOWN" because there is insufficient audio to generate a reliable speaker embedding. This prevents inaccurate embeddings from causing a single speaker to be split across multiple labels.
  • Overlapping speech — When two speakers talk simultaneously, the model cannot split the audio and will assign the turn to a single speaker. Performance degrades with frequent cross-talk.
  • Session start accuracy — The first 1–2 turns of a session may be misassigned because the model has not yet built up speaker profiles. This self-corrects quickly in practice.
  • Noisy environments — Background noise and microphone bleed between speakers can reduce embedding quality and lead to more frequent misassignments.
For the best results, use a microphone setup that minimizes cross-talk and background noise, and ensure each speaker produces at least a few complete sentences before you rely on per-turn labels for downstream processing.

Supported models

| Model | speech_model value | Diarization supported |
| --- | --- | --- |
| Universal-3 Pro Streaming | u3-rt-pro | Yes |
| Universal Streaming (English) | universal-streaming-english | Yes |
| Universal Streaming (Multilingual) | universal-streaming-multilingual | Yes |
| Whisper Streaming | whisper-rt | Yes |

Multichannel streaming audio

To transcribe multichannel streaming audio, we recommend creating a separate session for each channel. This approach allows you to maintain clear speaker separation and get accurate diarized transcriptions for conversations, phone calls, or interviews where speakers are recorded on two different channels. The following code example demonstrates how to transcribe a dual-channel audio file with diarized, speaker-separated transcripts. This same approach can be applied to any multi-channel audio stream, including those with more than two channels.
1
Firstly, install the required dependencies.
pip install websocket-client numpy pyaudio
2
Use this complete script to transcribe dual-channel audio with speaker separation:
import websocket
import json
import threading
import numpy as np
import wave
import time
import pyaudio
from urllib.parse import urlencode

# Configuration: credentials, input file, and streaming endpoint
YOUR_API_KEY = "<YOUR_API_KEY>"
AUDIO_FILE_PATH = "<DUAL_CHANNEL_AUDIO_FILE_PATH>"
API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"

# Query parameters for every per-channel session.
# NOTE(review): sample_rate presumably has to match the audio file's frame
# rate, and the silence values are presumably milliseconds - confirm against
# the API reference.
API_PARAMS = dict(
    sample_rate=8000,
    format_turns="true",
    end_of_turn_confidence_threshold=0.4,
    min_turn_silence=160,
    max_turn_silence=400,
)

# Full endpoint URL with the parameters URL-encoded into the query string
API_ENDPOINT = "?".join((API_BASE_URL, urlencode(API_PARAMS)))

class ChannelTranscriber:
    """Stream one channel of a multichannel WAV file to its own session.

    Each instance extracts a single channel from AUDIO_FILE_PATH, opens a
    WebSocket connection to API_ENDPOINT, sends the channel's audio in
    fixed-duration chunks paced at real time, and prints the transcripts it
    receives prefixed with ``channel_name``.

    NOTE: every instance writes to the same stdout and redraws its
    in-progress line with carriage returns, so partial lines from different
    channels can overwrite one another on screen.
    """

    # Seconds of audio per WebSocket message; also the delay between sends,
    # so the file is streamed at roughly real-time speed.
    CHUNK_SECONDS = 0.05
    # Minimum number of columns blanked when erasing an in-progress line.
    CLEAR_WIDTH = 100

    def __init__(self, channel_id, channel_name):
        """Create a transcriber for one channel.

        Args:
            channel_id: Zero-based index of the channel to extract.
            channel_name: Label printed in front of this channel's text.
        """
        self.channel_id = channel_id
        self.channel_name = channel_name
        self.ws_app = None
        self.audio_data = []  # list of raw 16-bit PCM chunks (bytes)
        self.current_turn_line = None  # length of the in-progress line, or None
        self.line_count = 0  # number of final transcript lines printed

    def load_audio_channel(self):
        """Extract this instance's channel from the WAV file into chunks.

        Works for any channel count (mono, stereo, or more). The result
        replaces ``self.audio_data`` with byte chunks of CHUNK_SECONDS each,
        the last one zero-padded to full length.

        Raises:
            ValueError: If the file is not 16-bit PCM, or ``channel_id`` is
                not a valid channel index for the file.
        """
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            frame_rate = wf.getframerate()
            frames = wf.readframes(wf.getnframes())

        if sample_width != 2:
            # The samples are interpreted as int16 below; any other width
            # would be decoded as garbage.
            raise ValueError(
                f"Expected 16-bit PCM audio, got {8 * sample_width}-bit samples"
            )
        if not 0 <= self.channel_id < n_channels:
            raise ValueError(
                f"Channel {self.channel_id} requested, but the file has "
                f"{n_channels} channel(s)"
            )

        # WAV PCM data is little-endian regardless of the host byte order.
        samples = np.frombuffer(frames, dtype="<i2")
        # Ignore a trailing incomplete frame so the reshape cannot fail.
        usable = len(samples) - len(samples) % n_channels
        channel_audio = samples[:usable].reshape(-1, n_channels)[:, self.channel_id]

        # Derive the chunk size from the file's own rate so that one chunk is
        # always CHUNK_SECONDS long (400 frames at 8 kHz).
        frames_per_chunk = max(1, round(frame_rate * self.CHUNK_SECONDS))
        chunks = []
        for start in range(0, len(channel_audio), frames_per_chunk):
            chunk = channel_audio[start:start + frames_per_chunk]
            if len(chunk) < frames_per_chunk:
                chunk = np.pad(chunk, (0, frames_per_chunk - len(chunk)), 'constant')
            chunks.append(chunk.astype("<i2").tobytes())
        # Assign rather than append so a repeated load does not duplicate audio.
        self.audio_data = chunks

    def on_open(self, ws):
        """Stream the loaded audio, then ask the server to end the session."""
        def stream_audio():
            try:
                for chunk in self.audio_data:
                    ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
                    time.sleep(self.CHUNK_SECONDS)  # pace at real time

                # Send termination message
                ws.send(json.dumps({"type": "Terminate"}))
            except Exception as e:
                # Typically the socket closed mid-stream; report it instead
                # of letting the thread die with a traceback.
                print(f"\n{self.channel_name}: error streaming audio: {e}")

        threading.Thread(target=stream_audio, daemon=True).start()

    def clear_current_line(self):
        """Erase the in-progress transcript line, if one is displayed."""
        if self.current_turn_line is not None:
            # Blank at least CLEAR_WIDTH columns, and more when the line
            # being replaced was longer than that.
            width = max(self.CLEAR_WIDTH, self.current_turn_line)
            print("\r" + " " * width + "\r", end="", flush=True)

    def print_partial_transcript(self, words):
        """Redraw the in-progress line from the words received so far.

        Args:
            words: List of word dicts; each word's "text" is joined by spaces.
        """
        self.clear_current_line()
        # Build transcript from individual words
        transcript = ' '.join(word.get('text', '') for word in words)
        partial_text = f"{self.channel_name}: {transcript}"
        print(partial_text, end="", flush=True)
        self.current_turn_line = len(partial_text)

    def print_final_transcript(self, transcript):
        """Replace the in-progress line with the final text for the turn.

        Args:
            transcript: Final transcript text of the completed turn.
        """
        self.clear_current_line()
        print(f"{self.channel_name}: {transcript}", flush=True)
        self.current_turn_line = None
        self.line_count += 1

    def on_message(self, ws, message):
        """Handle a server message; only "Turn" events produce output.

        Args:
            ws: The WebSocket object (unused).
            message: Raw JSON text received from the server.
        """
        data = json.loads(message)
        if data.get('type') != "Turn":
            return

        # `or` also covers explicit null values for these fields.
        transcript = (data.get('transcript') or '').strip()
        words = data.get('words') or []
        if not (transcript or words):
            return

        if data.get('end_of_turn'):
            self.print_final_transcript(transcript)
        else:
            self.print_partial_transcript(words)

    def start_transcription(self):
        """Load the channel audio and start the WebSocket session.

        Returns:
            The daemon thread running the WebSocket client loop.
        """
        self.load_audio_channel()

        self.ws_app = websocket.WebSocketApp(
            API_ENDPOINT,
            header={"Authorization": YOUR_API_KEY},
            on_open=self.on_open,
            on_message=self.on_message,
        )

        thread = threading.Thread(target=self.ws_app.run_forever, daemon=True)
        thread.start()
        return thread

def play_audio_file():
    """Play AUDIO_FILE_PATH through an output device while it is transcribed.

    Opens a PyAudio output stream matching the file's sample width, channel
    count, and frame rate, and writes the file to it in 1024-frame chunks.
    Any failure is reported on stdout instead of being raised; the output
    stream and the PyAudio instance are released even when playback fails.
    """
    chunk_size = 1024  # frames per write
    try:
        with wave.open(AUDIO_FILE_PATH, 'rb') as wf:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(
                    format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True
                )
                try:
                    print(f"Playing audio: {AUDIO_FILE_PATH}")

                    # readframes() returns b"" once the file is exhausted.
                    for data in iter(lambda: wf.readframes(chunk_size), b""):
                        stream.write(data)

                    stream.stop_stream()
                finally:
                    # Close the stream even if a write failed part-way.
                    stream.close()
            finally:
                # Release PyAudio whether or not the stream could be opened.
                p.terminate()

            print("Audio playback finished")

    except Exception as e:
        print(f"Error playing audio: {e}")


def transcribe_multichannel():
    """Transcribe both channels of the audio file while playing it back.

    Creates one ChannelTranscriber per channel, starts audio playback on a
    daemon thread, starts the two transcription sessions, and then blocks
    until both sessions and the playback have finished.
    """
    # One transcriber per channel: (channel index, display name)
    channel_specs = ((0, "Speaker 1"), (1, "Speaker 2"))
    transcribers = [ChannelTranscriber(index, label) for index, label in channel_specs]

    # Playback is started first, before any session is opened
    playback = threading.Thread(target=play_audio_file, daemon=True)
    playback.start()

    # Start the sessions in channel order and keep their worker threads
    workers = [transcriber.start_transcription() for transcriber in transcribers]

    # Block until every session has ended, then until playback is done
    for worker in workers:
        worker.join()
    playback.join()


if __name__ == "__main__":
    transcribe_multichannel()
Configure turn detection for your use case. The examples above use turn detection settings optimized for short responses and rapid back-and-forth conversations. To optimize for your specific audio scenario, you can adjust the turn detection parameters. For configuration examples tailored to different use cases, refer to our Configuration examples.
Modify the turn detection parameters in API_PARAMS:
# Turn detection is controlled by the last three parameters.
API_PARAMS = dict(
    sample_rate=8000,
    format_turns="true",
    end_of_turn_confidence_threshold=0.4,
    min_turn_silence=160,
    max_turn_silence=400,
)