Skip to main content

Documentation Index

Fetch the complete documentation index at: https://assemblyai.com/docs/llms.txt

Use this file to discover all available pages before exploring further.



US & EU
Medical Mode is an add-on that enhances streaming transcription accuracy for medical terminology — including medication names, procedures, conditions, and dosages. It is optimized for medical entity recognition to correct terms that other models frequently get wrong. Medical Mode can be used with all of our Streaming STT models. Enable Medical Mode by setting the domain connection parameter to "medical-v1". No other changes to your existing pipeline are required.
Medical Mode is billed as a separate add-on. See the pricing page for details.

Quickstart

pip install websocket-client pyaudio
import pyaudio
import websocket
import json
import threading
import time
from urllib.parse import urlencode
from datetime import datetime

# --- Configuration ---
YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

# Query parameters appended to the WebSocket URL below.
CONNECTION_PARAMS = {
    "sample_rate": 16000,  # Also used as the microphone capture rate (SAMPLE_RATE)
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1"  # Setting the domain to "medical-v1" is what enables Medical Mode
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
# Full endpoint: base URL plus the URL-encoded connection parameters.
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Audio Configuration
# NOTE: 800 frames is 50ms only at 16000Hz; this value is hard-coded and does
# not follow CONNECTION_PARAMS["sample_rate"] if that is changed.
FRAMES_PER_BUFFER = 800  # 50ms of audio (0.05s * 16000Hz)
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
CHANNELS = 1  # Single-channel capture
FORMAT = pyaudio.paInt16

# Global variables for audio stream and websocket
audio = None  # pyaudio.PyAudio instance, created in run()
stream = None  # Microphone input stream, opened in run() and read by the audio thread
ws_app = None  # websocket.WebSocketApp, created in run()
audio_thread = None  # Thread started in on_open() that sends microphone audio
stop_event = threading.Event()  # To signal the audio thread to stop

# --- WebSocket Event Handlers ---


def on_open(ws):
    """Announce the new connection and start the microphone sender thread.

    Args:
        ws: The connected WebSocket; audio chunks are sent on it as binary
            frames.

    The sender runs as a daemon thread stored in the module-level
    ``audio_thread`` and keeps going until ``stop_event`` is set or a
    read/send call raises.
    """
    global audio_thread

    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def pump_microphone():
        # Worker body: read one buffer from the microphone, forward it, repeat.
        print("Starting audio streaming...")
        while True:
            if stop_event.is_set():
                break
            try:
                chunk = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                # Audio goes out as a binary WebSocket message.
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
            except Exception as exc:
                print(f"Error streaming audio: {exc}")
                break
        print("Audio streaming stopped.")

    # Daemon thread so a stuck read cannot keep the process alive on exit.
    audio_thread = threading.Thread(target=pump_microphone, daemon=True)
    audio_thread.start()

def on_message(ws, message):
    """Handle one JSON message received from the streaming session.

    The message's "type" field selects the behavior:

    * "Begin": print the session ID and expiry time.
    * "Turn": print the transcript. Unformatted turns are written in place
      (carriage return, no newline) so they overwrite each other; a formatted
      turn clears the line and is printed on its own line.
    * "Termination": print the audio and session durations.

    Any other type is ignored.

    Args:
        ws: The WebSocket the message arrived on (not used).
        message: The raw JSON text of the message.

    Errors are reported on stdout rather than raised, so a single bad message
    does not propagate out of the callback.
    """
    try:
        data = json.loads(message)
        msg_type = data.get('type')

        if msg_type == "Begin":
            session_id = data.get('id')
            expires_at = data.get('expires_at')
            # A Begin message without expires_at must still report the session
            # ID; datetime.fromtimestamp(None) would raise and hide it.
            if expires_at is not None:
                expires_text = datetime.fromtimestamp(expires_at)
            else:
                expires_text = "unknown"
            print(f"\nSession began: ID={session_id}, ExpiresAt={expires_text}")
        elif msg_type == "Turn":
            transcript = data.get('transcript', '')
            formatted = data.get('turn_is_formatted', False)

            if formatted:
                # Blank out the in-progress partial before printing the final text.
                print('\r' + ' ' * 80 + '\r', end='')
                print(transcript)
            else:
                print(f"\r{transcript}", end='')
        elif msg_type == "Termination":
            audio_duration = data.get('audio_duration_seconds', 0)
            session_duration = data.get('session_duration_seconds', 0)
            print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s")
    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")

def on_error(ws, error):
    """Report a WebSocket error and signal the audio thread to stop.

    Args:
        ws: The WebSocket that raised the error (not used).
        error: The error object; it is printed as-is.
    """
    report = f"\nWebSocket Error: {error}"
    print(report)
    # The audio loop checks this event and stops sending once it is set.
    stop_event.set()


def on_close(ws, close_status_code, close_msg):
    """Stop audio capture and release the microphone when the socket closes.

    Args:
        ws: The WebSocket that was closed (not used).
        close_status_code: Close status code, printed as-is.
        close_msg: Close message, printed as-is.

    The audio thread is stopped before the stream is released, so the thread
    is not left reading from a stream that has just been closed or set to
    None.
    """
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")

    global stream, audio
    stop_event.set()

    # Wait briefly for the audio thread to see stop_event and leave its loop
    # before tearing down the stream it reads from. The timeout keeps a stuck
    # read or send from blocking shutdown.
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None


# --- Main Execution ---
def run():
    """Open the microphone, stream audio over the WebSocket, and clean up.

    Steps:
      1. Create the PyAudio instance and open the input stream; on failure,
         report the error, release PyAudio, and return.
      2. Create the WebSocketApp and run it on a daemon thread.
      3. Poll until the WebSocket thread ends or Ctrl+C is pressed.
      4. On Ctrl+C, send a {"type": "Terminate"} message, give the connection
         up to 5 seconds to close, then close it locally.
      5. Always release the audio stream and PyAudio instance on the way out.
    """
    global audio, stream, ws_app

    audio = pyaudio.PyAudio()

    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
            # Do not leave a terminated handle in the module-level global.
            audio = None
        return

    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run the WebSocket loop in the background so this thread can catch Ctrl+C.
    ws_thread = threading.Thread(target=ws_app.run_forever)
    ws_thread.daemon = True
    ws_thread.start()

    try:
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()

        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                terminate_message = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(terminate_message)}")
                ws_app.send(json.dumps(terminate_message))
                # Wait for the connection to close instead of always sleeping
                # the full 5 seconds: the join returns as soon as the
                # WebSocket thread ends, and gives up after 5 seconds.
                ws_thread.join(timeout=5)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        if ws_app:
            ws_app.close()

        ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)

    finally:
        # Take the handles out of the globals before releasing them, so an
        # on_close that fires afterwards sees None and does not release the
        # same objects a second time.
        mic, stream = stream, None
        pa, audio = audio, None
        if mic:
            if mic.is_active():
                mic.stop_stream()
            mic.close()
        if pa:
            pa.terminate()
        print("Cleanup complete. Exiting.")


if __name__ == "__main__":
    run()

Example output

Without Medical Mode:
I have here insulin to be used for both prandial mealtime and sliding scale is
insulin lisprohumalog subcutaneously.
With Medical Mode, lisprohumalog is updated to Lispro (Humalog) - following the standard medical convention of writing the generic name first, with the brand name in parentheses.
I have here insulin to be used for both prandial mealtime and sliding scale is
insulin Lispro (Humalog) subcutaneously.

Use cases

Medical Mode is designed for healthcare AI applications where accurate medical terminology is critical:
  • Ambient clinical documentation — Capture medication names, dosages, and clinical terms correctly during live patient encounters.
  • Real-time medical scribes — Deliver accurate transcripts to clinicians during or immediately after a consult.
  • Front-office voice agents — Handle drug names, provider names, and clinic-specific terminology in scheduling calls and insurance verification.
  • Medical contact centers — Transcribe calls with correct medical vocabulary for downstream processing and quality assurance.

Combine with other features

Medical Mode works alongside other streaming features. For example, you can combine it with speaker labels and keyterms prompting:
# Medical Mode combined with speaker labels and a keyterms prompt.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",  # Enables Medical Mode
    # Passed as the string "true" rather than Python True — presumably because
    # these values are URL-encoded into the connection query string; confirm.
    "speaker_labels": "true",
    # The list of key terms is serialized to a JSON string before being sent.
    "keyterms_prompt": json.dumps(["Lisinopril", "Metformin", "Humalog"])
}

Configuration for medical audio

Medical conversations — such as clinical dictation, patient encounters, and ambient scribes — have different speech patterns than typical voice agent interactions. Clinicians often pause mid-sentence to think, review a chart, or formulate a diagnosis. The default turn detection settings are optimized for fast-paced voice agent dialogues and can incorrectly fragment these natural pauses into separate turns. To prevent premature turn boundaries in medical audio, increase the silence thresholds:
// Turn-detection silence thresholds recommended for medical audio (values in ms).
const streamingConfig = {
  min_turn_silence: 800, // lets a speaker pause mid-sentence without triggering an end-of-turn check
  max_turn_silence: 3600, // allows extended pauses before a turn boundary is forced
};
| Parameter | Default | Recommended for Medical | Why |
| --- | --- | --- | --- |
| min_turn_silence | 100 ms (U3 Pro) / 400 ms (Universal Streaming) | 800 ms | Gives clinicians time to pause mid-sentence without triggering a speculative end-of-turn check. |
| max_turn_silence | 1000 ms (U3 Pro) / 1280 ms (Universal Streaming) | 3600 ms | Allows extended pauses for chart review or thinking without forcing a turn boundary. |
These values match the Conservative quick start configuration on the turn detection page. You can further adjust them based on your specific workflow — for example, a real-time medical scribe may benefit from a lower max_turn_silence (around 2000 ms) than a dictation application.
# Medical Mode with the longer turn-silence thresholds recommended for medical audio.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",  # Enables Medical Mode
    "min_turn_silence": 800,  # ms
    "max_turn_silence": 3600,  # ms
}
Avoid setting end_of_turn_confidence_threshold to 0. If you are using a Universal Streaming model (not U3 Pro), do not set end_of_turn_confidence_threshold to 0. This completely disables semantic turn detection and forces a turn boundary at every silence, which is especially harmful for medical audio where mid-sentence pauses are common. See Turn detection for details.

HIPAA compliance

AssemblyAI offers a Business Associate Agreement (BAA) for customers who need to process Protected Health Information (PHI). AssemblyAI is SOC 2 Type 2, ISO 27001:2022, and PCI DSS v4.0 certified. Medical Mode does not change existing data handling or retention policies. For BAA setup or enterprise pricing, contact our sales team.