Transcribe audio files with Streaming

This guide shows you how to transcribe WAV audio files with varying sample rates using our Streaming API.

Quickstart

Here is the complete Python script to transcribe a WAV audio file using the Streaming API.

import websocket
import json
import threading
import time
import wave
import sys
import os
from urllib.parse import urlencode
from pathlib import Path

# --- Configuration ---
# API key is read from the environment; this raises KeyError if unset.
ASSEMBLYAI_API_KEY = os.environ["ASSEMBLYAI_API_KEY"]
AUDIO_FILE = "audio.wav" # Path to your audio file
SAMPLE_RATE = 48000 # Change to match the sample rate of your audio file
SAVE_TRANSCRIPT_TO_FILE = True # Set to False to disable saving transcript to file
PLAY_AUDIO = True # Set to False to disable audio playback

# Query parameters for the streaming endpoint; sample_rate must match the
# audio that is actually sent, which validate_audio_file() enforces.
CONNECTION_PARAMS = {
    "speech_model": "u3-rt-pro",
    "sample_rate": SAMPLE_RATE,
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Global variables
ws_app = None        # WebSocketApp instance, created in run()
audio_thread = None  # background thread that streams the audio file
stop_event = threading.Event()  # signals the streaming thread to stop early

# Track session data for output file
session_data = {
    "session_id": None,
    "audio_file": AUDIO_FILE,
    "audio_duration_seconds": None,
    "turns": []
}
37
38# --- Helper Functions ---
39
def validate_audio_file(filepath, sample_rate):
    """Validate that *filepath* is a mono WAV file at *sample_rate* Hz.

    On any validation failure, prints an actionable error (including the
    ffmpeg command that fixes the file) to stderr and exits with status 1.

    Args:
        filepath: Relative or absolute path to the audio file.
        sample_rate: Expected sample rate in Hz; must match the file.
    """
    # Single conversion hint reused by every error branch.
    ffmpeg_hint = f"ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav"

    # Fail with a clear message instead of a FileNotFoundError traceback.
    if not os.path.isfile(filepath):
        print(f"Error: Audio file not found: {filepath}", file=sys.stderr)
        sys.exit(1)

    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: {ffmpeg_hint}", file=sys.stderr)
        sys.exit(1)

    try:
        with wave.open(filepath, 'rb') as wav_file:
            if wav_file.getnchannels() != 1:
                print("Error: Only mono audio is supported", file=sys.stderr)
                print(f"Convert your file to mono using: {ffmpeg_hint}", file=sys.stderr)
                sys.exit(1)

            file_sample_rate = wav_file.getframerate()
            if file_sample_rate != sample_rate:
                print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
                print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: {ffmpeg_hint}", file=sys.stderr)
                sys.exit(1)
    except wave.Error as e:
        # A .wav extension on a non-WAV payload previously crashed with a
        # raw traceback; report it in the same friendly style instead.
        print(f"Error: Could not read {filepath} as a WAV file: {e}", file=sys.stderr)
        print(f"Convert your file to WAV using: {ffmpeg_hint}", file=sys.stderr)
        sys.exit(1)
59
60
def save_transcript(data=None):
    """Write the collected transcript to a text file.

    The file is named ``{audio_filename}_{session_id}.txt`` and is created
    in the current working directory (the original docstring incorrectly
    claimed the script's directory; ``open`` with a relative path uses CWD).

    Args:
        data: Session dict with keys ``session_id``, ``audio_file``,
            ``audio_duration_seconds`` and ``turns``. Defaults to the
            module-level ``session_data`` collected during the session.

    Returns:
        The path of the file that was written.
    """
    if data is None:
        data = session_data

    audio_name = Path(data["audio_file"]).stem
    # Fall back to "unknown" if the session never reached the Begin message.
    session_id = data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {data['session_id']}\n")
        f.write(f"Audio file: {data['audio_file']}\n")
        f.write(f"Audio duration: {data['audio_duration_seconds']} seconds\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/universal-3-pro-streaming/universal-3-pro-streaming#request.query\n\n")
        f.write("\nTranscription Output\n")
        # Number turns from 1 for readability.
        for i, turn in enumerate(data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")
    return output_file
77
78
79# --- WebSocket Event Handlers ---
80
def on_open(ws):
    """Called when the WebSocket connection is established.

    Starts a daemon thread that streams the WAV file to the server in
    real time, optionally playing it through the speakers as it goes.
    """
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def stream_file():
        # Send audio in 50 ms chunks to approximate real-time capture.
        chunk_duration = 0.05 # 50ms chunks
        audio_player = None

        if PLAY_AUDIO:
            try:
                # pyaudio is optional; playback is skipped (with a warning)
                # when it is not installed.
                import pyaudio
                p = pyaudio.PyAudio()
                with wave.open(AUDIO_FILE, 'rb') as wav_file:
                    audio_player = p.open(
                        format=p.get_format_from_width(wav_file.getsampwidth()),
                        channels=wav_file.getnchannels(),
                        rate=wav_file.getframerate(),
                        output=True
                    )
            except ImportError:
                print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
                print("Install with: pip install pyaudio", file=sys.stderr)

        try:
            with wave.open(AUDIO_FILE, 'rb') as wav_file:
                frames_per_chunk = int(SAMPLE_RATE * chunk_duration)

                # stop_event lets on_error/on_close/Ctrl+C end the stream early.
                while not stop_event.is_set():
                    frames = wav_file.readframes(frames_per_chunk)
                    if not frames:
                        # End of file - all audio has been read.
                        break

                    if audio_player:
                        # The blocking playback write paces the loop in real time.
                        audio_player.write(frames)
                    else:
                        # No playback: sleep to simulate real-time pacing.
                        time.sleep(chunk_duration)

                    ws.send(frames, websocket.ABNF.OPCODE_BINARY)
        finally:
            # Release PortAudio resources even if streaming failed mid-file.
            if audio_player:
                audio_player.stop_stream()
                audio_player.close()
                p.terminate()

        # All audio sent - terminate the session
        print("File streaming complete. Waiting for final transcripts...")
        try:
            ws.send(json.dumps({"type": "Terminate"}))
        except Exception:
            # Socket may already be closed; nothing more to do.
            pass

    global audio_thread
    audio_thread = threading.Thread(target=stream_file)
    # Daemon thread: does not block interpreter exit if the main thread dies.
    audio_thread.daemon = True
    audio_thread.start()
137
138
def on_message(ws, message):
    """Dispatch an incoming Streaming API message by its ``type`` field.

    Handles ``Begin`` (record session id), ``Turn`` (print partial/final
    transcripts, keep finals) and ``Termination`` (record audio duration).
    Malformed messages are reported but never raise.
    """
    try:
        payload = json.loads(message)
        kind = payload.get('type')

        if kind == "Begin":
            session_id = payload.get('id')
            session_data["session_id"] = session_id
            print(f"Session ID: {session_id}\n")
            return

        if kind == "Turn":
            text = payload.get('transcript', '')
            if not text:
                # Nothing recognized yet for this turn.
                return
            if payload.get('end_of_turn'):
                # Final, formatted transcript for the turn - keep it.
                print(f"[Final]: {text}\n")
                session_data["turns"].append(text)
            else:
                # Interim result; a later message supersedes it.
                print(f"[Partial]: {text}")
            return

        if kind == "Termination":
            duration = payload.get('audio_duration_seconds', 0)
            session_data["audio_duration_seconds"] = duration
            print(f"Session terminated: {duration} seconds of audio processed")
    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")
164
165
def on_error(ws, error):
    """Error callback: halt the audio streamer and report the failure."""
    # The connection is no longer usable, so stop feeding it audio.
    stop_event.set()
    print(f"\nWebSocket Error: {error}")


def on_close(ws, close_status_code, close_msg):
    """Close callback: stop streaming, persist the transcript, reap the thread."""
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")
    stop_event.set()

    # Write out whatever turns were collected, even on an abnormal close.
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()

    # Give the streaming thread a moment to observe stop_event and exit.
    streamer = audio_thread
    if streamer and streamer.is_alive():
        streamer.join(timeout=1.0)
182
183
184# --- Main Execution ---
# --- Main Execution ---
def run():
    """Validate the audio file, connect the WebSocket, and stream until done.

    Blocks until the session terminates (end of file, error, or Ctrl+C),
    then shuts down the connection and background threads in order.
    """
    global ws_app

    # Validate audio file before connecting
    validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

    # Create WebSocketApp
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": ASSEMBLYAI_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run WebSocketApp in a separate thread
    ws_thread = threading.Thread(target=ws_app.run_forever)
    ws_thread.daemon = True
    ws_thread.start()

    try:
        # Poll until the connection closes; this is the normal exit path
        # after the server processes the Terminate message.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()

        # Ask the server to finalize the session before closing the socket.
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                ws_app.send(json.dumps({"type": "Terminate"}))
                # Brief grace period so final transcripts can still arrive.
                time.sleep(2)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        if ws_app:
            ws_app.close()
            ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
            ws_thread.join(timeout=2.0)

    finally:
        print("Cleanup complete. Exiting.")


if __name__ == "__main__":
    run()

Step-by-step guide

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up and get your API key from your dashboard.

Install and import packages

Install the required packages. PyAudio is optional — only needed for audio playback during streaming.

$pip install websocket-client
$pip install pyaudio

Import packages.

1import websocket
2import json
3import os
4import threading
5import time
6import wave
7import sys
8from urllib.parse import urlencode
9from pathlib import Path

Configure settings

Set your ASSEMBLYAI_API_KEY environment variable.

Set AUDIO_FILE to the relative or absolute path of your audio file, and set SAMPLE_RATE to match your file’s sample rate.

# API key is read from the environment; this raises KeyError if unset.
ASSEMBLYAI_API_KEY = os.environ["ASSEMBLYAI_API_KEY"]
AUDIO_FILE = "audio.wav" # Path to your audio file
SAMPLE_RATE = 48000 # Change to match the sample rate of your audio file
SAVE_TRANSCRIPT_TO_FILE = True # Set to False to disable saving transcript to file
PLAY_AUDIO = True # Set to False to disable audio playback

# Query parameters for the streaming endpoint; sample_rate must match the
# audio that is actually sent, which validate_audio_file() enforces.
CONNECTION_PARAMS = {
    "speech_model": "u3-rt-pro",
    "sample_rate": SAMPLE_RATE,
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}"

# Global variables
ws_app = None        # WebSocketApp instance, created in run()
audio_thread = None  # background thread that streams the audio file
stop_event = threading.Event()  # signals the streaming thread to stop early

# Track session data for output file
session_data = {
    "session_id": None,
    "audio_file": AUDIO_FILE,
    "audio_duration_seconds": None,
    "turns": []
}

Helper functions

The following helper functions are used to validate audio files and save the transcript output:

  • validate_audio_file() - Validates that the audio file is a mono WAV file with the expected sample rate.
  • save_transcript() - Saves the transcript to a text file after the session ends.
def validate_audio_file(filepath, sample_rate):
    """Validate audio file before streaming.

    Ensures the file is a mono WAV at the expected sample rate; on any
    failure, prints an actionable error (with the ffmpeg fix) to stderr
    and exits with status 1.
    """
    file_ext = Path(filepath).suffix.lower()
    if file_ext != ".wav":
        print(f"Error: Only WAV files are supported. Got: {file_ext}", file=sys.stderr)
        print(f"Convert your file to WAV using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
        sys.exit(1)

    with wave.open(filepath, 'rb') as wav_file:
        if wav_file.getnchannels() != 1:
            print("Error: Only mono audio is supported", file=sys.stderr)
            print(f"Convert your file to mono using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)

        # The rate sent in CONNECTION_PARAMS must match the actual audio.
        file_sample_rate = wav_file.getframerate()
        if file_sample_rate != sample_rate:
            print(f"Error: File sample rate ({file_sample_rate}) doesn't match expected rate ({sample_rate})", file=sys.stderr)
            print(f"Either update SAMPLE_RATE to {file_sample_rate}, or convert your file using: ffmpeg -i {filepath} -ar {sample_rate} -ac 1 output.wav", file=sys.stderr)
            sys.exit(1)


def save_transcript():
    """Save the transcript to a text file in the current working directory.

    The file is named ``{audio_filename}_{session_id}.txt``; ``open`` with
    a relative path writes to the CWD, not the script's directory.
    """
    audio_name = Path(session_data["audio_file"]).stem
    # Fall back to "unknown" if the session never reached the Begin message.
    session_id = session_data["session_id"] or "unknown"
    output_file = f"{audio_name}_{session_id}.txt"

    with open(output_file, "w") as f:
        f.write(f"AssemblyAI Session ID: {session_data['session_id']}\n")
        f.write(f"Audio file: {session_data['audio_file']}\n")
        f.write(f"Audio duration: {session_data['audio_duration_seconds']} seconds\n")
        f.write("See all available parameters and defaults at https://www.assemblyai.com/docs/api-reference/streaming-api/universal-3-pro-streaming/universal-3-pro-streaming#request.query\n\n")
        f.write("\nTranscription Output\n")
        # Number turns from 1 for readability.
        for i, turn in enumerate(session_data["turns"], 1):
            f.write(f"[Turn #{i}]: {turn}\n")

    print(f"Transcript saved to {output_file}")

WebSocket event handlers

Open WebSocket and stream audio file

When the connection opens, we start a background thread that reads the WAV file in 50ms chunks and sends them over the WebSocket. If PLAY_AUDIO is enabled, the audio is also played through your speakers.

def on_open(ws):
    """Called when the WebSocket connection is established.

    Starts a daemon thread that streams the WAV file to the server in
    real time, optionally playing it through the speakers as it goes.
    """
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def stream_file():
        # Send audio in 50 ms chunks to approximate real-time capture.
        chunk_duration = 0.05 # 50ms chunks
        audio_player = None

        if PLAY_AUDIO:
            try:
                # pyaudio is optional; playback is skipped (with a warning)
                # when it is not installed.
                import pyaudio
                p = pyaudio.PyAudio()
                with wave.open(AUDIO_FILE, 'rb') as wav_file:
                    audio_player = p.open(
                        format=p.get_format_from_width(wav_file.getsampwidth()),
                        channels=wav_file.getnchannels(),
                        rate=wav_file.getframerate(),
                        output=True
                    )
            except ImportError:
                print("Warning: pyaudio not installed. Audio playback disabled.", file=sys.stderr)
                print("Install with: pip install pyaudio", file=sys.stderr)

        try:
            with wave.open(AUDIO_FILE, 'rb') as wav_file:
                frames_per_chunk = int(SAMPLE_RATE * chunk_duration)

                # stop_event lets on_error/on_close/Ctrl+C end the stream early.
                while not stop_event.is_set():
                    frames = wav_file.readframes(frames_per_chunk)
                    if not frames:
                        # End of file - all audio has been read.
                        break

                    if audio_player:
                        # The blocking playback write paces the loop in real time.
                        audio_player.write(frames)
                    else:
                        # No playback: sleep to simulate real-time pacing.
                        time.sleep(chunk_duration)

                    ws.send(frames, websocket.ABNF.OPCODE_BINARY)
        finally:
            # Release PortAudio resources even if streaming failed mid-file.
            if audio_player:
                audio_player.stop_stream()
                audio_player.close()
                p.terminate()

        # All audio sent - terminate the session
        print("File streaming complete. Waiting for final transcripts...")
        try:
            ws.send(json.dumps({"type": "Terminate"}))
        except Exception:
            # Socket may already be closed; nothing more to do.
            pass

    global audio_thread
    audio_thread = threading.Thread(target=stream_file)
    # Daemon thread: does not block interpreter exit if the main thread dies.
    audio_thread.daemon = True
    audio_thread.start()

Handle WebSocket messages

def on_message(ws, message):
    """Handle a JSON message from the Streaming API.

    Message types:
      Begin       - session started; record the session ID.
      Turn        - partial or final transcript for the current turn.
      Termination - session over; record total audio duration.
    """
    try:
        data = json.loads(message)
        msg_type = data.get('type')

        if msg_type == "Begin":
            session_data["session_id"] = data.get('id')
            print(f"Session ID: {data.get('id')}\n")
        elif msg_type == "Turn":
            transcript = data.get('transcript', '')
            if not transcript:
                # Nothing recognized yet for this turn.
                return

            if data.get('end_of_turn'):
                # Final, formatted transcript for the turn - keep it.
                print(f"[Final]: {transcript}\n")
                session_data["turns"].append(transcript)
            else:
                # Interim result; a later message supersedes it.
                print(f"[Partial]: {transcript}")
        elif msg_type == "Termination":
            session_data["audio_duration_seconds"] = data.get('audio_duration_seconds', 0)
            print(f"Session terminated: {data.get('audio_duration_seconds', 0)} seconds of audio processed")
    except json.JSONDecodeError as e:
        print(f"Error decoding message: {e}")
    except Exception as e:
        print(f"Error handling message: {e}")

WebSocket error and close handlers

def on_error(ws, error):
    """Called when a WebSocket error occurs."""
    print(f"\nWebSocket Error: {error}")
    # The connection is no longer usable, so stop feeding it audio.
    stop_event.set()


def on_close(ws, close_status_code, close_msg):
    """Called when the WebSocket connection is closed."""
    print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}")
    stop_event.set()

    # Write out whatever turns were collected, even on an abnormal close.
    if SAVE_TRANSCRIPT_TO_FILE:
        save_transcript()

    # Give the streaming thread a moment to observe stop_event and exit.
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)

Connect and stream the file

def run():
    """Validate the audio file, connect the WebSocket, and stream until done.

    Blocks until the session terminates (end of file, error, or Ctrl+C),
    then shuts down the connection and background threads in order.
    """
    global ws_app

    # Validate audio file before connecting
    validate_audio_file(AUDIO_FILE, SAMPLE_RATE)

    # Create WebSocketApp
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": ASSEMBLYAI_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # Run WebSocketApp in a separate thread
    ws_thread = threading.Thread(target=ws_app.run_forever)
    ws_thread.daemon = True
    ws_thread.start()

    try:
        # Poll until the connection closes; this is the normal exit path
        # after the server processes the Terminate message.
        while ws_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()

        # Ask the server to finalize the session before closing the socket.
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                ws_app.send(json.dumps({"type": "Terminate"}))
                # Brief grace period so final transcripts can still arrive.
                time.sleep(2)
            except Exception as e:
                print(f"Error sending termination message: {e}")

        if ws_app:
            ws_app.close()
            ws_thread.join(timeout=2.0)

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        stop_event.set()
        if ws_app:
            ws_app.close()
            ws_thread.join(timeout=2.0)

    finally:
        print("Cleanup complete. Exiting.")


if __name__ == "__main__":
    run()

The session will terminate once the file is finished streaming. If SAVE_TRANSCRIPT_TO_FILE is enabled (default), the transcript will be saved to {audio_filename}_{session_id}.txt in the current working directory.

The AUDIO_FILE path can be either relative (e.g., audio.wav) or absolute (e.g., /path/to/audio.wav).

Example output

Here’s an example of what the console output looks like when streaming an audio file:

1WebSocket connection opened.
2Connected to: wss://streaming.assemblyai.com/v3/ws?speech_model=u3-rt-pro&sample_rate=48000
3
4Session ID: f37d7c4e-6be9-47ed-b6fc-7600fc78e34d
5
6[Partial]: the
7[Partial]: the quick
8[Partial]: the quick brown
9[Partial]: the quick brown fox
10[Partial]: the quick brown fox jumps
11[Partial]: the quick brown fox jumps over
12[Partial]: the quick brown fox jumps over the
13[Partial]: the quick brown fox jumps over the lazy
14[Partial]: The quick brown fox jumps over the lazy dog
15[Final]: The quick brown fox jumps over the lazy dog.
16
17[Partial]: It
18[Partial]: It is
19[Partial]: It is a
20[Partial]: It is a common
21[Partial]: It is a common typing
22[Partial]: It is a common typing test
23[Final]: It is a common typing test.
24
25File streaming complete. Waiting for final transcripts...
26Session terminated: 7.52 seconds of audio processed
27
28WebSocket Disconnected: Status=1000, Msg=None
29Transcript saved to audio_f37d7c4e-6be9-47ed-b6fc-7600fc78e34d.txt
30Cleanup complete. Exiting.

The output shows:

  • Partial transcripts: Real-time updates as words are recognized
  • Final transcripts: The complete turn with proper capitalization and punctuation