Medical Mode - AssemblyAI

Supported models

Supported languages

Supported regions

US & EU

Medical Mode is an add-on that enhances streaming transcription accuracy for medical terminology — including medication names, procedures, conditions, and dosages. It is optimized for medical entity recognition to correct terms that other models frequently get wrong. Medical Mode can be used with all of our Streaming STT models. Enable Medical Mode by setting the domain connection parameter to "medical-v1". No other changes to your existing pipeline are required.

Medical Mode is billed as a separate add-on. See the pricing page for details.

Quickstart

Python
Python SDK
Javascript
JavaScript SDK

pip install websocket-client pyaudio

pip install assemblyai

npm install ws mic

npm install assemblyai node-record-lpcm16

The module node-record-lpcm16 requires SoX, which must be available in your $PATH. For macOS:

brew install sox

For most Linux distros:

sudo apt-get install sox libsox-fmt-all

For Windows: download the binaries

Python
Python SDK
Javascript
JavaScript SDK

import pyaudio
import websocket
import json
import threading
import time
from urllib.parse import urlencode
from datetime import datetime

# --- Configuration ---
YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

# Query parameters sent when the streaming session is opened.
# "domain": "medical-v1" is the setting that enables Medical Mode.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Audio Configuration
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
CHUNK_DURATION_MS = 50  # Length of each audio chunk read from the microphone
# Derived from the sample rate so a chunk stays 50 ms long if "sample_rate"
# above is changed (800 frames at 16000 Hz).
FRAMES_PER_BUFFER = SAMPLE_RATE * CHUNK_DURATION_MS // 1000
CHANNELS = 1
FORMAT = pyaudio.paInt16

# Global variables for audio stream and websocket
audio = None  # pyaudio.PyAudio instance, created in run()
stream = None  # Open microphone input stream
ws_app = None  # websocket.WebSocketApp driving the session
audio_thread = None  # Thread that forwards microphone audio to the socket
stop_event = threading.Event()  # To signal the audio thread to stop

# --- WebSocket Event Handlers ---


def on_open(ws):
    """Announce the open connection and start the microphone sender thread.

    Audio is forwarded from a background daemon thread so this callback can
    return immediately. The thread keeps running until ``stop_event`` is set
    or a read/send raises.
    """
    global audio_thread

    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def pump_microphone():
        """Read fixed-size chunks from the microphone and send them as binary frames."""
        print("Starting audio streaming...")
        while True:
            if stop_event.is_set():
                break
            try:
                chunk = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                # Audio goes out as a binary WebSocket message, not text.
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
            except Exception as exc:
                # Any read/send failure ends the sender loop.
                print(f"Error streaming audio: {exc}")
                break
        print("Audio streaming stopped.")

    audio_thread = threading.Thread(target=pump_microphone, daemon=True)
    audio_thread.start()

def on_message(ws, message):
    """Decode one JSON message from the server and print it by message type.

    ``Begin`` prints the session id and expiry, ``Turn`` prints the transcript
    (partials overwrite the current line, formatted turns get their own line),
    and ``Termination`` prints the audio/session durations. Other types are
    ignored. Decoding and handling errors are printed, never raised.
    """
    try:
        payload = json.loads(message)
        kind = payload.get('type')

        if kind == "Begin":
            began_id = payload.get('id')
            expiry = payload.get('expires_at')
            print(f"\nSession began: ID={began_id}, ExpiresAt={datetime.fromtimestamp(expiry)}")
            return

        if kind == "Turn":
            text = payload.get('transcript', '')
            if not payload.get('turn_is_formatted', False):
                # Partial result: redraw the current line in place.
                print(f"\r{text}", end='')
                return
            # Final formatted turn: blank out the partial text, then print it on its own line.
            blank = ' ' * 80
            print(f"\r{blank}\r", end='')
            print(text)
            return

        if kind == "Termination":
            audio_secs = payload.get('audio_duration_seconds', 0)
            session_secs = payload.get('session_duration_seconds', 0)
            print(f"\nSession Terminated: Audio Duration={audio_secs}s, Session Duration={session_secs}s")
    except json.JSONDecodeError as exc:
        print(f"Error decoding message: {exc}")
    except Exception as exc:
        print(f"Error handling message: {exc}")

def on_error(ws, error):
    """Report a WebSocket error and tell the audio sender thread to stop."""
    report = f"\nWebSocket Error: {error}"
    print(report)
    # Setting the event makes the sender loop exit on its next iteration.
    stop_event.set()


def on_close(ws, close_status_code, close_msg):
    """Release audio resources once the WebSocket connection has closed.

    Prints the close status, signals the sender thread to stop, waits briefly
    for it to finish, and then closes the microphone stream and the PyAudio
    instance. The module-level ``stream`` and ``audio`` are reset to ``None``
    so later cleanup code does not release them a second time.

    Args:
        ws: The WebSocketApp that closed (unused).
        close_status_code: Close status reported for the connection.
        close_msg: Close reason reported for the connection.
    """
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")

    global stream, audio
    stop_event.set()

    # Join the sender thread BEFORE touching the stream: it may still be
    # inside stream.read(), and closing the stream underneath it would make
    # that read fail. The timeout keeps shutdown from hanging on a stuck read.
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None


# --- Main Execution ---
def run():
    """Stream microphone audio to the streaming endpoint until the session ends.

    Opens the default microphone, connects the WebSocket on a background
    thread, and blocks until that thread exits. On Ctrl+C it asks the server
    to terminate the session, waits for the connection to wind down, and then
    releases the audio resources. Returns early (after printing the error) if
    the microphone cannot be opened.
    """
    global audio, stream, ws_app

    audio = pyaudio.PyAudio()

    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
        return

    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run the socket loop on a daemon thread so the main thread stays free to
    # receive KeyboardInterrupt.
    ws_thread = threading.Thread(target=ws_app.run_forever, daemon=True)
    ws_thread.start()

    try:
        # Poll instead of join() so Ctrl+C is noticed promptly.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()

        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                terminate_message = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(terminate_message)}")
                ws_app.send(json.dumps(terminate_message))
                # Give the server up to 5 s to answer and close the session,
                # but return as soon as the socket thread exits instead of
                # always sleeping for the full 5 s.
                ws_thread.join(timeout=5.0)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        if ws_app:
            ws_app.close()

        ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)

    finally:
        # on_close normally releases these and resets them to None; this
        # covers the paths where it did not run.
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")


# Start the streaming demo only when this file is executed as a script.
if __name__ == "__main__":
    run()

import logging
from typing import Type

import assemblyai as aai
from assemblyai.streaming.v3 import (
    BeginEvent,
    StreamingClient,
    StreamingClientOptions,
    StreamingError,
    StreamingEvents,
    StreamingParameters,
    TerminationEvent,
    TurnEvent,
)

# Placeholder credential: replace with your AssemblyAI API key before running.
api_key = "<YOUR_API_KEY>"

# Configure the root logger at INFO level; `logger` is this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def on_begin(self: Type[StreamingClient], event: BeginEvent):
    """Print the id of the streaming session that has just started."""
    session_id = event.id
    print(f"Session started: {session_id}")


def on_turn(self: Type[StreamingClient], event: TurnEvent):
    """Print a turn: partials redraw the current line, formatted turns end it."""
    if not event.turn_is_formatted:
        # Partial result: stay on the same line and flush so it shows immediately.
        print(f"\r{event.transcript}", end='', flush=True)
        return

    # Formatted turn: wipe the partial text, then print the final transcript.
    blank = ' ' * 100
    print(f"\r{blank}\r{event.transcript}")


def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
    """Print how many seconds of audio were processed in the finished session."""
    processed = event.audio_duration_seconds
    print(f"Session terminated: {processed} seconds of audio processed")


def on_error(self: Type[StreamingClient], error: StreamingError):
    """Print a streaming error reported by the client."""
    message = f"Error occurred: {error}"
    print(message)


def main():
    """Connect a streaming client with Medical Mode and stream the microphone.

    Registers the event handlers, opens the session with ``domain`` set to
    ``"medical-v1"``, streams microphone audio, and always disconnects
    (terminating the session) when streaming stops.
    """
    options = StreamingClientOptions(
        api_key=api_key,
        api_host="streaming.assemblyai.com",
    )
    client = StreamingClient(options)

    # Event -> handler wiring, registered in the same order as listed.
    handlers = (
        (StreamingEvents.Begin, on_begin),
        (StreamingEvents.Turn, on_turn),
        (StreamingEvents.Termination, on_terminated),
        (StreamingEvents.Error, on_error),
    )
    for event_type, handler in handlers:
        client.on(event_type, handler)

    session_params = StreamingParameters(
        sample_rate=16000,
        speech_model="u3-rt-pro",
        domain="medical-v1",
    )
    client.connect(session_params)

    try:
        microphone = aai.extras.MicrophoneStream(sample_rate=16000)
        client.stream(microphone)
    finally:
        client.disconnect(terminate=True)


# Start the SDK streaming demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

const WebSocket = require("ws");
const mic = require("mic");
const querystring = require("querystring");

// --- Configuration ---
const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key
// Query parameters sent when the streaming session is opened.
// `domain: "medical-v1"` is the setting that enables Medical Mode.
const CONNECTION_PARAMS = {
  sample_rate: 16000,
  speech_model: "u3-rt-pro",
  domain: "medical-v1",
};
const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws";
// Full endpoint URL: base URL plus the connection parameters as a query string.
const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`;

// Audio Configuration
// The microphone is recorded at the same rate that is declared to the server.
const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate;
const CHANNELS = 1;

// Global variables
let micInstance = null; // mic recorder, created in startMicrophone()
let micInputStream = null; // audio stream obtained from micInstance
let ws = null; // WebSocket connection, created in run()
let stopRequested = false; // set by cleanup(); stops audio from being sent

// --- Helper functions ---
/**
 * Convert a Unix timestamp in seconds to an ISO 8601 string.
 *
 * @param {number} timestamp Seconds since the Unix epoch.
 * @returns {string} The same instant as an ISO 8601 UTC string.
 */
function formatTimestamp(timestamp) {
  // Date expects milliseconds, so scale the seconds value first.
  const millis = timestamp * 1000;
  const moment = new Date(millis);
  return moment.toISOString();
}

// --- Main function ---
/**
 * Open the streaming WebSocket, wire up its event handlers, and install the
 * process termination handlers. The microphone starts once the socket opens.
 */
async function run() {
  console.log("Starting AssemblyAI real-time transcription...");

  // Parse one server message and print it according to its `type`.
  const handleMessage = (message) => {
    try {
      const data = JSON.parse(message);

      switch (data.type) {
        case "Begin": {
          const sessionId = data.id;
          const expiresAt = data.expires_at;
          console.log(
            `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}`
          );
          break;
        }
        case "Turn": {
          const transcript = data.transcript || "";
          if (data.turn_is_formatted) {
            // Final formatted turn: wipe the partial text, then print on its own line.
            process.stdout.write("\r" + " ".repeat(100) + "\r");
            console.log(transcript);
          } else {
            // Partial result: redraw the current line in place.
            process.stdout.write(`\r${transcript}`);
          }
          break;
        }
        case "Termination": {
          const audioDuration = data.audio_duration_seconds;
          const sessionDuration = data.session_duration_seconds;
          console.log(
            `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s`
          );
          break;
        }
        default:
          // Other message types are ignored.
          break;
      }
    } catch (error) {
      console.error(`\nError handling message: ${error}`);
      console.error(`Message data: ${message}`);
    }
  };

  const handleOpen = () => {
    console.log("WebSocket connection opened.");
    console.log(`Connected to: ${API_ENDPOINT}`);
    startMicrophone();
  };

  const handleError = (error) => {
    console.error(`\nWebSocket Error: ${error}`);
    cleanup();
  };

  const handleClose = (code, reason) => {
    console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`);
    cleanup();
  };

  // The API key is sent in the Authorization header of the handshake.
  const socketOptions = { headers: { Authorization: YOUR_API_KEY } };
  ws = new WebSocket(API_ENDPOINT, socketOptions);

  ws.on("open", handleOpen);
  ws.on("message", handleMessage);
  ws.on("error", handleError);
  ws.on("close", handleClose);

  setupTerminationHandlers();
}

/**
 * Start recording from the microphone and forward each audio chunk to the
 * WebSocket. Any failure is logged and followed by cleanup().
 */
function startMicrophone() {
  try {
    const micOptions = {
      rate: String(SAMPLE_RATE),
      channels: String(CHANNELS),
      debug: false,
      exitOnSilence: 6,
    };
    micInstance = mic(micOptions);
    micInputStream = micInstance.getAudioStream();

    // Only send while the socket is open and no shutdown has been requested.
    const forwardChunk = (data) => {
      const socketOpen = ws && ws.readyState === WebSocket.OPEN;
      if (!socketOpen || stopRequested) {
        return;
      }
      ws.send(data);
    };

    const handleMicError = (err) => {
      console.error(`Microphone Error: ${err}`);
      cleanup();
    };

    micInputStream.on("data", forwardChunk);
    micInputStream.on("error", handleMicError);

    micInstance.start();
    console.log("Microphone stream opened successfully.");
    console.log("Speak into your microphone. Press Ctrl+C to stop.");
  } catch (error) {
    console.error(`Error opening microphone stream: ${error}`);
    cleanup();
  }
}

/**
 * Stop the microphone and close the WebSocket session.
 *
 * This is reached from several places (socket "error" and "close" events,
 * microphone errors, and the process termination handlers), so it is
 * idempotent: only the first call does any work, later calls return
 * immediately instead of repeating the teardown and its log output.
 */
function cleanup() {
  // stopRequested is only set here, so it doubles as the "already cleaned up" flag.
  if (stopRequested) {
    return;
  }
  stopRequested = true;

  if (micInstance) {
    try {
      micInstance.stop();
    } catch (error) {
      console.error(`Error stopping microphone: ${error}`);
    }
    micInstance = null;
  }

  if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) {
    try {
      // A Terminate message can only be sent on an open socket; a socket
      // that is still connecting is just closed.
      if (ws.readyState === WebSocket.OPEN) {
        const terminateMessage = { type: "Terminate" };
        console.log(
          `Sending termination message: ${JSON.stringify(terminateMessage)}`
        );
        ws.send(JSON.stringify(terminateMessage));
      }
      ws.close();
    } catch (error) {
      console.error(`Error closing WebSocket: ${error}`);
    }
    ws = null;
  }

  console.log("Cleanup complete.");
}

/**
 * Install process-level handlers so Ctrl+C, SIGTERM, and uncaught exceptions
 * all run cleanup() and then exit after a one-second grace period.
 */
function setupTerminationHandlers() {
  // Shared tail of every handler: clean up now, exit one second later.
  const stopAndExit = (exitCode) => {
    cleanup();
    setTimeout(() => process.exit(exitCode), 1000);
  };

  process.on("SIGINT", () => {
    console.log("\nCtrl+C received. Stopping...");
    stopAndExit(0);
  });

  process.on("SIGTERM", () => {
    console.log("\nTermination signal received. Stopping...");
    stopAndExit(0);
  });

  process.on("uncaughtException", (error) => {
    console.error(`\nUncaught exception: ${error}`);
    stopAndExit(1);
  });
}

// Start the application: run() opens the WebSocket and registers the
// termination handlers; the microphone starts once the socket is open.
run();

import { Readable } from "stream";
import { AssemblyAI } from "assemblyai";
import recorder from "node-record-lpcm16";

/**
 * Connect a streaming transcriber with Medical Mode enabled, pipe microphone
 * audio into it, and print turns as they arrive. Ctrl-C stops the recording
 * and closes the session.
 */
const run = async () => {
  const client = new AssemblyAI({ apiKey: "<YOUR_API_KEY>" });

  // `domain: "medical-v1"` is the setting that enables Medical Mode.
  const transcriber = client.streaming.transcriber({
    sampleRate: 16_000,
    speechModel: "u3-rt-pro",
    domain: "medical-v1",
  });

  // Partials redraw the current line; formatted turns wipe it and print a full line.
  const printTurn = (turn) => {
    if (!turn.turn_is_formatted) {
      process.stdout.write(`\r${turn.transcript}`);
      return;
    }
    process.stdout.write("\r" + " ".repeat(100) + "\r");
    console.log(turn.transcript);
  };

  transcriber.on("open", ({ id }) => {
    console.log(`Session opened with ID: ${id}`);
  });
  transcriber.on("error", (error) => {
    console.error("Error:", error);
  });
  transcriber.on("close", (code, reason) => {
    console.log("Session closed:", code, reason);
  });
  transcriber.on("turn", printTurn);

  try {
    console.log("Connecting to streaming transcript service");
    await transcriber.connect();

    console.log("Starting recording");
    const recorderOptions = {
      channels: 1,
      sampleRate: 16_000,
      audioType: "wav", // Linear PCM
    };
    const recording = recorder.record(recorderOptions);

    // Bridge the Node stream to a web stream so it can be piped to the transcriber.
    const microphone = Readable.toWeb(recording.stream());
    microphone.pipeTo(transcriber.stream());

    // Stop recording and close connection using Ctrl-C.
    const shutdown = async () => {
      console.log();
      console.log("Stopping recording");
      recording.stop();

      console.log("Closing streaming transcript connection");
      await transcriber.close();

      process.exit();
    };
    process.on("SIGINT", shutdown);
  } catch (error) {
    console.error(error);
  }
};

// Start the demo; errors thrown while connecting or starting the recorder
// are caught and logged inside run().
run();

Example output

Without Medical Mode:

I have here insulin to be used for both prandial mealtime and sliding scale is
insulin lisprohumalog subcutaneously.

With Medical Mode, lisprohumalog is updated to Lispro (Humalog) — following the standard medical convention of writing the generic name first, with the brand name in parentheses.

I have here insulin to be used for both prandial mealtime and sliding scale is
insulin Lispro (Humalog) subcutaneously.

Use cases

Medical Mode is designed for healthcare AI applications where accurate medical terminology is critical:

Ambient clinical documentation — Capture medication names, dosages, and clinical terms correctly during live patient encounters.
Real-time medical scribes — Deliver accurate transcripts to clinicians during or immediately after a consult.
Front-office voice agents — Handle drug names, provider names, and clinic-specific terminology in scheduling calls and insurance verification.
Medical contact centers — Transcribe calls with correct medical vocabulary for downstream processing and quality assurance.

Combine with other features

Medical Mode works alongside other streaming features. You can combine it with:

Streaming Diarization to identify who said what in clinical conversations
Keyterms Prompting to further boost accuracy for specific medical terms unique to your use case

Python
Python SDK
Javascript
JavaScript SDK

# Medical Mode combined with streaming diarization and keyterms prompting.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",
    # Given as the lowercase string: when this dict goes through urlencode(),
    # the Python bool True would be rendered as "True".
    "speaker_labels": "true",
    # The list of terms is sent as a single JSON-encoded query value.
    "keyterms_prompt": json.dumps(["Lisinopril", "Metformin", "Humalog"])
}

# Medical Mode combined with streaming diarization (speaker_labels) and
# keyterms prompting, set through the SDK's StreamingParameters.
client.connect(
    StreamingParameters(
        sample_rate=16000,
        speech_model="u3-rt-pro",
        domain="medical-v1",
        speaker_labels=True,
        keyterms_prompt=["Lisinopril", "Metformin", "Humalog"],
    )
)

// Medical Mode combined with streaming diarization and keyterms prompting.
const CONNECTION_PARAMS = {
  sample_rate: 16000,
  speech_model: "u3-rt-pro",
  domain: "medical-v1",
  speaker_labels: true,
  // The list of terms is sent as a single JSON-encoded query value.
  keyterms_prompt: JSON.stringify(["Lisinopril", "Metformin", "Humalog"]),
};

// Medical Mode combined with streaming diarization (speakerLabels) and
// keyterms prompting, set through the SDK's transcriber options.
const transcriber = client.streaming.transcriber({
  sampleRate: 16_000,
  speechModel: "u3-rt-pro",
  domain: "medical-v1",
  speakerLabels: true,
  keytermsPrompt: ["Lisinopril", "Metformin", "Humalog"],
});

Configuration for medical audio

Medical conversations — such as clinical dictation, patient encounters, and ambient scribes — have different speech patterns than typical voice agent interactions. Clinicians often pause mid-sentence to think, review a chart, or formulate a diagnosis. The default turn detection settings are optimized for fast-paced voice agent dialogues and can incorrectly fragment these natural pauses into separate turns. To prevent premature turn boundaries in medical audio, increase the silence thresholds:

// Longer silence thresholds (in ms) recommended for medical audio, so that
// mid-sentence pauses are not split into separate turns.
const streamingConfig = {
  min_turn_silence: 800,
  max_turn_silence: 3600,
};

Parameter	Default	Recommended for Medical	Why
`min_turn_silence`	`100` ms (U3 Pro) / `400` ms (Universal Streaming)	`800` ms	Gives clinicians time to pause mid-sentence without triggering a speculative end-of-turn check.
`max_turn_silence`	`1000` ms (U3 Pro) / `1280` ms (Universal Streaming)	`3600` ms	Allows extended pauses for chart review or thinking without forcing a turn boundary.

These values match the Conservative quick start configuration on the turn detection page. You can further adjust them based on your specific workflow — for example, a real-time medical scribe may benefit from a lower max_turn_silence (around 2000 ms) than a dictation application.

Python
Python SDK
Javascript
JavaScript SDK

# Medical Mode with the silence thresholds (in ms) recommended for medical audio.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",
    "min_turn_silence": 800,  # lets clinicians pause mid-sentence
    "max_turn_silence": 3600,  # allows extended pauses, e.g. for chart review
}

# Medical Mode with the silence thresholds (in ms) recommended for medical
# audio, set through the SDK's StreamingParameters.
client.connect(
    StreamingParameters(
        sample_rate=16000,
        speech_model="u3-rt-pro",
        domain="medical-v1",
        min_turn_silence=800,
        max_turn_silence=3600,
    )
)

// Medical Mode with the silence thresholds (in ms) recommended for medical audio.
const CONNECTION_PARAMS = {
  sample_rate: 16000,
  speech_model: "u3-rt-pro",
  domain: "medical-v1",
  min_turn_silence: 800, // lets clinicians pause mid-sentence
  max_turn_silence: 3600, // allows extended pauses, e.g. for chart review
};

// Medical Mode with the silence thresholds (in ms) recommended for medical
// audio, set through the SDK's transcriber options.
const transcriber = client.streaming.transcriber({
  sampleRate: 16_000,
  speechModel: "u3-rt-pro",
  domain: "medical-v1",
  minTurnSilence: 800,
  maxTurnSilence: 3600,
});

Avoid setting end_of_turn_confidence_threshold to 0. If you are using a Universal Streaming model (not U3 Pro), do not set end_of_turn_confidence_threshold to 0. This completely disables semantic turn detection and forces a turn boundary at every silence, which is especially harmful for medical audio where mid-sentence pauses are common. See Turn detection for details.

HIPAA compliance

AssemblyAI offers a Business Associate Agreement (BAA) for customers who need to process Protected Health Information (PHI). AssemblyAI is SOC 2 Type 2, ISO 27001:2022, and PCI DSS v4.0 certified. Medical Mode does not change existing data handling or retention policies. For BAA setup or enterprise pricing, contact our sales team.

Documentation Index

​Quickstart

​Example output

​Use cases

​Combine with other features

​Configuration for medical audio

​HIPAA compliance

Quickstart

Example output

Use cases

Combine with other features

Configuration for medical audio

HIPAA compliance