Documentation Index Fetch the complete documentation index at: https://assemblyai.com/docs/llms.txt
Use this file to discover all available pages before exploring further.
Medical Mode is an add-on that enhances streaming transcription accuracy for medical terminology — including medication names, procedures, conditions, and dosages. It is optimized for medical entity recognition to correct terms that other models frequently get wrong.
Medical Mode can be used with all of our Streaming STT models.
Enable Medical Mode by setting the domain connection parameter to "medical-v1". No other changes to your existing pipeline are required.
Medical Mode is billed as a separate add-on. See the pricing page for details.
Quickstart
Python
Python SDK
Javascript
JavaScript SDK
pip install websocket-client pyaudio
npm install assemblyai node-record-lpcm16
The module node-record-lpcm16 requires SoX, and it must be available in your $PATH.
For macOS: brew install sox
For most Linux distros: sudo apt-get install sox libsox-fmt-all
For Windows: download the binaries
Python
Python SDK
Javascript
JavaScript SDK
import pyaudio
import websocket
import json
import threading
import time
from urllib.parse import urlencode
from datetime import datetime
# --- Configuration ---
YOUR_API_KEY = "YOUR-API-KEY"  # Replace with your actual API key

# Query parameters sent when the streaming connection is opened.
# Setting "domain" to "medical-v1" is what enables Medical Mode.
CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "speech_model": "u3-rt-pro",
    "domain": "medical-v1",
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Audio Configuration
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
# 50 ms of audio per read. Derived from the sample rate (800 frames at
# 16000 Hz) so the buffer stays at 50 ms if the sample rate is changed.
FRAMES_PER_BUFFER = SAMPLE_RATE * 50 // 1000
CHANNELS = 1
FORMAT = pyaudio.paInt16

# Global variables for audio stream and websocket
audio = None
stream = None
ws_app = None
audio_thread = None
stop_event = threading.Event()  # To signal the audio thread to stop
# --- WebSocket Event Handlers ---
def on_open(ws):
    """Start streaming microphone audio once the WebSocket is connected.

    Logs the connection, then starts a daemon thread that repeatedly reads
    ``FRAMES_PER_BUFFER`` frames from the global ``stream`` and sends them
    over ``ws`` as binary messages. The thread runs until ``stop_event`` is
    set or a read/send raises, and is stored in the global ``audio_thread``.

    Args:
        ws: The connected WebSocket used to send the audio data.
    """
    global audio_thread

    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def stream_audio():
        """Pump microphone buffers to the WebSocket until told to stop."""
        print("Starting audio streaming...")
        while not stop_event.is_set():
            try:
                audio_data = stream.read(
                    FRAMES_PER_BUFFER, exception_on_overflow=False
                )
                # Send audio data as binary message
                ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
            except Exception as e:
                print(f"Error streaming audio: {e}")
                break
        print("Audio streaming stopped.")

    # Start sending audio data in a separate thread so the WebSocket
    # callback returns immediately.
    audio_thread = threading.Thread(target=stream_audio, daemon=True)
    audio_thread.start()
def on_message(ws, message):
    """Handle one JSON message received from the streaming API.

    Dispatches on the message's ``type`` field:

    * ``"Begin"`` - prints the session ID and expiry time.
    * ``"Turn"`` - prints the transcript. Unformatted (partial) turns
      overwrite the current terminal line; formatted turns clear the line
      and are printed followed by a newline.
    * ``"Termination"`` - prints the audio and session durations.

    Errors are reported on stdout instead of being raised, so a bad message
    does not propagate out of the callback.

    Args:
        ws: The WebSocket the message arrived on (unused).
        message: The raw JSON payload.
    """
    try:
        data = json.loads(message)
        msg_type = data.get('type')
        if msg_type == "Begin":
            session_id = data.get('id')
            expires_at = data.get('expires_at')
            print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}")
        elif msg_type == "Turn":
            transcript = data.get('transcript', '')
            formatted = data.get('turn_is_formatted', False)
            if formatted:
                # Blank out the partial line before printing the final text.
                print('\r' + ' ' * 80 + '\r', end='')
                print(transcript)
            else:
                # No trailing newline here, so a line-buffered stdout would
                # hold the text back; flush so partials show up immediately.
                print(f"\r{transcript}", end='', flush=True)
        elif msg_type == "Termination":
            audio_duration = data.get('audio_duration_seconds', 0)
            session_duration = data.get('session_duration_seconds', 0)
            print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s")
    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")
def on_error(ws, error):
    """Report a WebSocket error and signal the audio thread to stop.

    Args:
        ws: The WebSocket that reported the error (unused).
        error: The error to report; it is formatted into the message.
    """
    print(f"\nWebSocket Error: {error}")
    # The audio loop checks this event each iteration.
    stop_event.set()
def on_close(ws, close_status_code, close_msg):
    """Release audio resources after the WebSocket connection has closed.

    Signals the audio thread to stop and waits briefly for it to finish
    *before* the microphone stream is stopped and closed, so the stream is
    not closed while the thread may still be reading from it. The global
    ``stream`` and ``audio`` are reset to ``None`` once released.

    Args:
        ws: The WebSocket that was closed (unused).
        close_status_code: Close status code, printed as-is.
        close_msg: Close message, printed as-is.
    """
    global stream, audio

    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")

    # Stop the producer first: the audio thread reads from `stream`, so it
    # must be out of its loop before the stream is torn down.
    stop_event.set()
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None
# --- Main Execution ---
def run():
    """Open the microphone, connect to the streaming API, and run until stopped.

    Opens a PyAudio input stream, starts the WebSocket client on a daemon
    thread, and then waits until that thread ends. On Ctrl+C a
    ``{"type": "Terminate"}`` message is sent and the function waits up to
    five seconds for the session to close before closing the socket itself.
    The microphone stream and PyAudio are always released on exit.

    Returns early, after printing an error, if the microphone stream cannot
    be opened.
    """
    global audio, stream, ws_app

    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
    except Exception as e:
        print(f"Error opening microphone stream: {e}")
        if audio:
            audio.terminate()
        return

    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )
    ws_thread = threading.Thread(target=ws_app.run_forever)
    ws_thread.daemon = True
    ws_thread.start()

    try:
        # Poll instead of a bare join() so Ctrl+C is handled promptly.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                terminate_message = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(terminate_message)}")
                ws_app.send(json.dumps(terminate_message))
                # Wait up to 5 s for the session to close, returning as soon
                # as the WebSocket thread ends rather than always sleeping
                # for the full 5 s.
                ws_thread.join(timeout=5.0)
            except Exception as e:
                print(f"Error sending termination message: {e}")
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        ws_thread.join(timeout=2.0)
    finally:
        # on_close normally releases these and resets them to None; this
        # covers the case where it has not run.
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")
if __name__ == "__main__" :
run()
See all 187 lines
import logging
from typing import Type
import assemblyai as aai
from assemblyai.streaming.v3 import (
BeginEvent,
StreamingClient,
StreamingClientOptions,
StreamingError,
StreamingEvents,
StreamingParameters,
TerminationEvent,
TurnEvent,
)
api_key = "<YOUR_API_KEY>"
logging.basicConfig( level = logging. INFO )
logger = logging.getLogger( __name__ )
def on_begin(self: StreamingClient, event: BeginEvent):
    """Print the session ID when the streaming session begins."""
    print(f"Session started: {event.id}")
def on_turn(self: StreamingClient, event: TurnEvent):
    """Print the transcript carried by a turn event.

    Formatted turns clear the current line and are printed with a trailing
    newline; unformatted turns overwrite the current line in place.
    """
    if event.turn_is_formatted:
        print(f"\r{' ' * 100}\r{event.transcript}")
    else:
        # No newline is written, so flush to make the partial text visible.
        print(f"\r{event.transcript}", end='', flush=True)
def on_terminated(self: StreamingClient, event: TerminationEvent):
    """Print how many seconds of audio the terminated session processed."""
    print(
        f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
    )
def on_error(self: StreamingClient, error: StreamingError):
    """Print a streaming error reported by the client."""
    print(f"Error occurred: {error}")
def main():
    """Stream microphone audio to a Medical Mode streaming session.

    Builds a streaming client, registers the event handlers, connects with
    ``domain="medical-v1"``, and streams the microphone until the stream
    ends or raises. The client is always disconnected with
    ``terminate=True`` afterwards.
    """
    options = StreamingClientOptions(
        api_key=api_key,
        api_host="streaming.assemblyai.com",
    )
    client = StreamingClient(options)

    # Register one handler per event type.
    handlers = (
        (StreamingEvents.Begin, on_begin),
        (StreamingEvents.Turn, on_turn),
        (StreamingEvents.Termination, on_terminated),
        (StreamingEvents.Error, on_error),
    )
    for event_type, handler in handlers:
        client.on(event_type, handler)

    session_parameters = StreamingParameters(
        sample_rate=16000,
        speech_model="u3-rt-pro",
        domain="medical-v1",
    )
    client.connect(session_parameters)

    try:
        client.stream(aai.extras.MicrophoneStream(sample_rate=16000))
    finally:
        client.disconnect(terminate=True)
if __name__ == "__main__" :
main()
See all 73 lines
const WebSocket = require ( "ws" );
const mic = require ( "mic" );
const querystring = require ( "querystring" );
// --- Configuration ---
const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key

// Query parameters sent when the streaming connection is opened.
// Setting `domain` to "medical-v1" is what enables Medical Mode.
const CONNECTION_PARAMS = {
  sample_rate: 16000,
  speech_model: "u3-rt-pro",
  domain: "medical-v1",
};
const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws";
// No whitespace may appear inside the template: it would end up in the URL.
const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`;

// Audio Configuration
const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate;
const CHANNELS = 1;

// Global variables
let micInstance = null;
let micInputStream = null;
let ws = null;
let stopRequested = false; // Set by cleanup(); stops audio from being sent
// --- Helper functions ---
/**
 * Convert a Unix timestamp in seconds to an ISO 8601 string.
 *
 * @param {number} timestamp - Seconds since the Unix epoch.
 * @returns {string} The timestamp as returned by `Date#toISOString`.
 */
function formatTimestamp(timestamp) {
  // Date expects milliseconds.
  const milliseconds = timestamp * 1000;
  const date = new Date(milliseconds);
  return date.toISOString();
}
// --- Main function ---
/**
 * Connect to the streaming endpoint and wire up the WebSocket handlers.
 *
 * The microphone is started once the socket opens. Incoming messages are
 * dispatched on their `type` field ("Begin", "Turn", "Termination"), and
 * both "error" and "close" trigger cleanup().
 */
async function run() {
  console.log("Starting AssemblyAI real-time transcription...");

  ws = new WebSocket(API_ENDPOINT, {
    headers: {
      Authorization: YOUR_API_KEY,
    },
  });

  ws.on("open", () => {
    console.log("WebSocket connection opened.");
    console.log(`Connected to: ${API_ENDPOINT}`);
    startMicrophone();
  });

  ws.on("message", (message) => {
    try {
      const data = JSON.parse(message);
      const msgType = data.type;

      if (msgType === "Begin") {
        const sessionId = data.id;
        const expiresAt = data.expires_at;
        console.log(
          `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}`
        );
      } else if (msgType === "Turn") {
        const transcript = data.transcript || "";
        const formatted = data.turn_is_formatted;

        if (formatted) {
          // Blank out the partial line, then print the final text.
          process.stdout.write("\r" + " ".repeat(100) + "\r");
          console.log(transcript);
        } else {
          // Overwrite the current line with the latest partial text.
          process.stdout.write(`\r${transcript}`);
        }
      } else if (msgType === "Termination") {
        const audioDuration = data.audio_duration_seconds;
        const sessionDuration = data.session_duration_seconds;
        console.log(
          `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s`
        );
      }
    } catch (error) {
      console.error(`\nError handling message: ${error}`);
      console.error(`Message data: ${message}`);
    }
  });

  ws.on("error", (error) => {
    console.error(`\nWebSocket Error: ${error}`);
    cleanup();
  });

  ws.on("close", (code, reason) => {
    console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`);
    cleanup();
  });

  setupTerminationHandlers();
}
/**
 * Start capturing microphone audio and forward it to the WebSocket.
 *
 * Audio chunks are sent only while the socket is open and no stop has been
 * requested. A microphone error, or a failure to start, triggers cleanup().
 */
function startMicrophone() {
  try {
    micInstance = mic({
      rate: SAMPLE_RATE.toString(),
      channels: CHANNELS.toString(),
      debug: false,
      exitOnSilence: 6,
    });

    micInputStream = micInstance.getAudioStream();

    micInputStream.on("data", (data) => {
      // Drop audio once shutdown has begun or the socket is not open.
      if (ws && ws.readyState === WebSocket.OPEN && !stopRequested) {
        ws.send(data);
      }
    });

    micInputStream.on("error", (err) => {
      console.error(`Microphone Error: ${err}`);
      cleanup();
    });

    micInstance.start();
    console.log("Microphone stream opened successfully.");
    console.log("Speak into your microphone. Press Ctrl+C to stop.");
  } catch (error) {
    console.error(`Error opening microphone stream: ${error}`);
    cleanup();
  }
}
/**
 * Stop the microphone and close the WebSocket.
 *
 * Sets `stopRequested` so no further audio is sent. If the socket is open,
 * a `{ type: "Terminate" }` message is sent before it is closed. Both
 * `micInstance` and `ws` are reset to null once handled, so calling this
 * again finds nothing left to release.
 */
function cleanup() {
  stopRequested = true;

  if (micInstance) {
    try {
      micInstance.stop();
    } catch (error) {
      console.error(`Error stopping microphone: ${error}`);
    }
    micInstance = null;
  }

  if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) {
    try {
      // The Terminate message can only be sent on an open socket.
      if (ws.readyState === WebSocket.OPEN) {
        const terminateMessage = { type: "Terminate" };
        console.log(
          `Sending termination message: ${JSON.stringify(terminateMessage)}`
        );
        ws.send(JSON.stringify(terminateMessage));
      }
      ws.close();
    } catch (error) {
      console.error(`Error closing WebSocket: ${error}`);
    }
    ws = null;
  }

  console.log("Cleanup complete.");
}
/**
 * Register process-level handlers that shut the application down.
 *
 * SIGINT and SIGTERM exit with code 0; an uncaught exception exits with
 * code 1. Each handler logs a message, runs cleanup(), and exits one second
 * later.
 */
function setupTerminationHandlers() {
  // Run cleanup() now and exit after a 1 s delay.
  const exitAfterCleanup = (exitCode) => {
    cleanup();
    setTimeout(() => process.exit(exitCode), 1000);
  };

  process.on("SIGINT", () => {
    console.log("\nCtrl+C received. Stopping...");
    exitAfterCleanup(0);
  });

  process.on("SIGTERM", () => {
    console.log("\nTermination signal received. Stopping...");
    exitAfterCleanup(0);
  });

  process.on("uncaughtException", (error) => {
    console.error(`\nUncaught exception: ${error}`);
    exitAfterCleanup(1);
  });
}
// Start the application
run ();
See all 176 lines
import { Readable } from "stream" ;
import { AssemblyAI } from "assemblyai" ;
import recorder from "node-record-lpcm16" ;
/**
 * Stream microphone audio to a Medical Mode streaming session using the SDK.
 *
 * Creates a streaming transcriber with `domain: "medical-v1"`, registers
 * its event handlers, connects, and pipes the recording into the
 * transcriber. Ctrl+C stops the recording, closes the connection, and exits.
 */
const run = async () => {
  const client = new AssemblyAI({
    apiKey: "<YOUR_API_KEY>",
  });

  const transcriber = client.streaming.transcriber({
    sampleRate: 16_000,
    speechModel: "u3-rt-pro",
    domain: "medical-v1",
  });

  transcriber.on("open", ({ id }) => {
    console.log(`Session opened with ID: ${id}`);
  });

  transcriber.on("error", (error) => {
    console.error("Error:", error);
  });

  transcriber.on("close", (code, reason) =>
    console.log("Session closed:", code, reason)
  );

  transcriber.on("turn", (turn) => {
    if (turn.turn_is_formatted) {
      // Blank out the partial line, then print the final text.
      process.stdout.write("\r" + " ".repeat(100) + "\r");
      console.log(turn.transcript);
    } else {
      // Overwrite the current line with the latest partial text.
      process.stdout.write(`\r${turn.transcript}`);
    }
  });

  try {
    console.log("Connecting to streaming transcript service");
    await transcriber.connect();

    console.log("Starting recording");
    const recording = recorder.record({
      channels: 1,
      sampleRate: 16_000,
      audioType: "wav", // Linear PCM
    });

    // pipeTo() returns a promise that is not awaited here; attach a handler
    // so a failed pipe is logged instead of becoming an unhandled rejection.
    Readable.toWeb(recording.stream())
      .pipeTo(transcriber.stream())
      .catch((error) => console.error("Audio stream error:", error));

    // Stop recording and close connection using Ctrl-C.
    process.on("SIGINT", async function () {
      console.log();
      console.log("Stopping recording");
      recording.stop();

      console.log("Closing streaming transcript connection");
      await transcriber.close();

      process.exit();
    });
  } catch (error) {
    console.error(error);
  }
};
run ();
See all 69 lines
Example output
Without Medical Mode:
I have here insulin to be used for both prandial mealtime and sliding scale is
insulin lisprohumalog subcutaneously.
With Medical Mode, lisprohumalog is updated to Lispro (Humalog) - following the standard medical convention of writing the generic name first, with the brand name in parentheses.
I have here insulin to be used for both prandial mealtime and sliding scale is
insulin Lispro (Humalog) subcutaneously.
Use cases
Medical Mode is designed for healthcare AI applications where accurate medical terminology is critical:
Ambient clinical documentation — Capture medication names, dosages, and clinical terms correctly during live patient encounters.
Real-time medical scribes — Deliver accurate transcripts to clinicians during or immediately after a consult.
Front-office voice agents — Handle drug names, provider names, and clinic-specific terminology in scheduling calls and insurance verification.
Medical contact centers — Transcribe calls with correct medical vocabulary for downstream processing and quality assurance.
Combine with other features
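Medical Mode works alongside other streaming features. For example, you can combine it with speaker labels (speaker_labels) and keyterms prompting (keyterms_prompt), as shown below: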
Python
Python SDK
Javascript
JavaScript SDK
CONNECTION_PARAMS = {
"sample_rate" : 16000 ,
"speech_model" : "u3-rt-pro" ,
"domain" : "medical-v1" ,
"speaker_labels" : "true" ,
"keyterms_prompt" : json.dumps([ "Lisinopril" , "Metformin" , "Humalog" ])
}
client.connect(
StreamingParameters(
sample_rate = 16000 ,
speech_model = "u3-rt-pro" ,
domain = "medical-v1" ,
speaker_labels = True ,
keyterms_prompt = [ "Lisinopril" , "Metformin" , "Humalog" ],
)
)
const CONNECTION_PARAMS = {
sample_rate: 16000 ,
speech_model: "u3-rt-pro" ,
domain: "medical-v1" ,
speaker_labels: true ,
keyterms_prompt: JSON . stringify ([ "Lisinopril" , "Metformin" , "Humalog" ]),
};
const transcriber = client . streaming . transcriber ({
sampleRate: 16_000 ,
speechModel: "u3-rt-pro" ,
domain: "medical-v1" ,
speakerLabels: true ,
keytermsPrompt: [ "Lisinopril" , "Metformin" , "Humalog" ],
});
Configuration for medical audio
Medical conversations — such as clinical dictation, patient encounters, and ambient scribes — have different speech patterns than typical voice agent interactions. Clinicians often pause mid-sentence to think, review a chart, or formulate a diagnosis. The default turn detection settings are optimized for fast-paced voice agent dialogues and can incorrectly fragment these natural pauses into separate turns.
To prevent premature turn boundaries in medical audio, increase the silence thresholds:
const streamingConfig = {
min_turn_silence: 800 ,
max_turn_silence: 3600 ,
};
| Parameter | Default | Recommended for Medical | Why |
| --- | --- | --- | --- |
| min_turn_silence | 100 ms (U3 Pro) / 400 ms (Universal Streaming) | 800 ms | Gives clinicians time to pause mid-sentence without triggering a speculative end-of-turn check. |
| max_turn_silence | 1000 ms (U3 Pro) / 1280 ms (Universal Streaming) | 3600 ms | Allows extended pauses for chart review or thinking without forcing a turn boundary. |
These values match the Conservative quick start configuration on the turn detection page. You can further adjust them based on your specific workflow — for example, a real-time medical scribe may benefit from a lower max_turn_silence (around 2000 ms) than a dictation application.
Python
Python SDK
Javascript
JavaScript SDK
CONNECTION_PARAMS = {
"sample_rate" : 16000 ,
"speech_model" : "u3-rt-pro" ,
"domain" : "medical-v1" ,
"min_turn_silence" : 800 ,
"max_turn_silence" : 3600 ,
}
client.connect(
StreamingParameters(
sample_rate = 16000 ,
speech_model = "u3-rt-pro" ,
domain = "medical-v1" ,
min_turn_silence = 800 ,
max_turn_silence = 3600 ,
)
)
const CONNECTION_PARAMS = {
sample_rate: 16000 ,
speech_model: "u3-rt-pro" ,
domain: "medical-v1" ,
min_turn_silence: 800 ,
max_turn_silence: 3600 ,
};
const transcriber = client . streaming . transcriber ({
sampleRate: 16_000 ,
speechModel: "u3-rt-pro" ,
domain: "medical-v1" ,
minTurnSilence: 800 ,
maxTurnSilence: 3600 ,
});
Warning: Avoid setting end_of_turn_confidence_threshold to 0. If you are using a Universal Streaming model (not U3 Pro), do not set end_of_turn_confidence_threshold to 0. This completely disables semantic turn detection and forces a turn boundary at every silence, which is especially harmful for medical audio where mid-sentence pauses are common. See Turn detection for details.
HIPAA compliance
AssemblyAI offers a Business Associate Agreement (BAA) for customers who need to process Protected Health Information (PHI). AssemblyAI is SOC 2 Type 2, ISO 27001:2022, and PCI DSS v4.0 certified. Medical Mode does not change existing data handling or retention policies.
For BAA setup or enterprise pricing, contact our sales team .