Skip to content

Realtime Voice (Speech-to-Speech)

RoomKit's RealtimeVoiceChannel enables speech-to-speech AI conversations using providers like OpenAI Realtime API, Google Gemini Live, xAI Grok Realtime, and ElevenLabs Conversational AI. Audio flows directly between the client and the AI provider — no separate STT/TTS stages.

How It Differs from VoiceChannel

| Aspect        | VoiceChannel                      | RealtimeVoiceChannel          |
| ------------- | --------------------------------- | ----------------------------- |
| Audio flow    | Client → STT → AI → TTS → Client  | Client ↔ Provider (direct)    |
| VAD           | Client-side (AudioPipeline)       | Server-side (provider)        |
| Transcription | Generated by STT provider         | Provided by realtime provider |
| Tool calling  | Via AIChannel + hooks             | Direct provider callbacks     |
| Latency       | Higher (STT + AI + TTS)           | Lower (single round-trip)     |

Quick Start

from __future__ import annotations

from roomkit import RoomKit
from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.backends.mock import MockVoiceBackend
from roomkit.voice.realtime.providers.openai import OpenAIRealtimeProvider

provider = OpenAIRealtimeProvider(
    api_key="sk-...",
    model="gpt-4o-realtime-preview",
)

transport = MockVoiceBackend()

channel = RealtimeVoiceChannel(
    "voice-realtime",
    provider=provider,
    transport=transport,
    system_prompt="You are a helpful voice assistant.",
    voice="alloy",
    input_sample_rate=16000,
    output_sample_rate=24000,
    emit_transcription_events=True,
)

kit = RoomKit()
kit.register_channel(channel)

RealtimeVoiceChannel Configuration

channel = RealtimeVoiceChannel(
    channel_id="voice-realtime",
    provider=provider,                    # RealtimeVoiceProvider
    transport=transport,                  # VoiceBackend
    system_prompt="...",                  # AI instructions
    voice="alloy",                        # Provider voice preset
    tools=[my_tool],                      # Tool objects — definitions + handlers extracted automatically
    temperature=0.7,                      # Generation temperature
    input_sample_rate=16000,              # Audio from client
    output_sample_rate=24000,             # Audio from provider
    transport_sample_rate=None,           # Transport rate (auto-resamples if different)
    emit_transcription_events=True,       # Emit transcriptions as RoomEvents
    tool_handler=my_tool_handler,         # Optional — for MCP, auditing, or custom dispatch
    mute_on_tool_call=False,              # Mute mic during tool execution
    tool_result_max_length=16384,         # Truncate large tool results
)
| Parameter                 | Default  | Description |
| ------------------------- | -------- | ----------- |
| provider                  | required | Realtime AI provider (OpenAI, Gemini, xAI Grok, ElevenLabs) |
| transport                 | required | Audio transport backend |
| system_prompt             | None     | AI system instructions |
| voice                     | None     | Voice preset name |
| tools                     | None     | Tool objects or JSON definitions — when Tool objects are passed, definitions and handlers are extracted automatically |
| temperature               | None     | Sampling temperature |
| input_sample_rate         | 16000    | Client → provider sample rate |
| output_sample_rate        | 24000    | Provider → client sample rate |
| transport_sample_rate     | None     | Transport rate; auto-resamples if mismatched |
| emit_transcription_events | True     | Create RoomEvents from transcriptions |
| tool_handler              | None     | async (name: str, args: dict) -> str — optional when Tool objects are passed via tools |
| mute_on_tool_call         | False    | Mute mic during tool execution |
| tool_result_max_length    | 16384    | Max chars for tool results |

OpenAI Realtime API

WebSocket-based speech-to-speech with server-side VAD.

from __future__ import annotations

from roomkit.voice.realtime.providers.openai import OpenAIRealtimeProvider

provider = OpenAIRealtimeProvider(
    api_key="sk-...",
    model="gpt-4o-realtime-preview",
    base_url=None,                         # Custom endpoint (optional)
)

VAD Configuration

Pass VAD settings via provider_config in the channel or session:

# Server VAD (default)
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={
        "turn_detection": {
            "type": "server_vad",
            "threshold": 0.5,
            "silence_duration_ms": 800,
            "prefix_padding_ms": 300,
            "interrupt_response": True,
            "create_response": True,
        },
    },
)

# Semantic VAD (understands conversation flow)
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={
        "turn_detection": {
            "type": "semantic_vad",
            "eagerness": "high",           # low, medium, high, auto
        },
    },
)

# Manual turn management (no automatic VAD)
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={"turn_detection": None},
)

Available Voices

alloy, echo, shimmer, breeze, cinnamon, juniper, sage (varies by model)

Input Transcription

# Configure which model transcribes user input
provider_config={
    "stt_model": "gpt-4o-transcribe",
}

Google Gemini Live

Persistent streaming connection with session resumption and advanced features.

from __future__ import annotations

from roomkit.voice.realtime.providers.gemini import GeminiLiveProvider

provider = GeminiLiveProvider(
    api_key="your-gemini-key",
    model="gemini-2.5-flash-native-audio-preview-12-2025",
)

Advanced Configuration

Gemini supports several unique features via provider_config:

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    voice="Aoede",
    provider_config={
        # VAD sensitivity
        "start_of_speech_sensitivity": "HIGH",     # LOW, MEDIUM, HIGH
        "end_of_speech_sensitivity": "MEDIUM",
        "silence_duration_ms": 500,

        # Proactive audio (AI speaks without prompt)
        "proactive_audio": True,

        # Affective dialog (emotional responses)
        "enable_affective_dialog": True,

        # Extended thinking
        "thinking_budget": 1024,

        # Generation parameters
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": 2048,

        # Non-interruptible mode
        "activity_handling": "NO_INTERRUPTION",

        # Language
        "language_code": "en-US",
    },
)
| Feature            | Description |
| ------------------ | ----------- |
| Proactive audio    | AI can initiate speech without user prompt |
| Affective dialog   | Emotional, expressive responses |
| Thinking budget    | Extended reasoning before responding |
| Session resumption | Preserves context across reconfiguration |
| Non-interruptible  | Prevent user barge-in during responses |

Available Voices

Aoede, Fenrir, Kore, Pax, Breeze, Charon, Ember, Orion, Stella, and more.

Session Resumption

Gemini preserves conversation context when reconfigured — useful for agent handoff:

# Start with general assistant
session = await channel.start_session(room_id, participant_id)

# Hand off to specialist — context is preserved
await channel.reconfigure_session(
    session.id,
    system_prompt="You are a billing specialist.",
    voice="Kore",
    tools=billing_tools,
)

xAI Grok Realtime

WebSocket-based speech-to-speech using xAI's Grok models with server-side VAD, built-in transcription, and native web/X search tools.

from __future__ import annotations

from roomkit.providers.xai.config import XAIRealtimeConfig
from roomkit.providers.xai.realtime import XAIRealtimeProvider

provider = XAIRealtimeProvider(
    XAIRealtimeConfig(
        api_key="xai-...",
        model="grok-3-fast",                   # Default model
        voice="eve",                           # Default voice
        transcription_model="grok-2-audio",    # Input transcription model
    )
)

# Or with keyword arguments:
provider = XAIRealtimeProvider(
    api_key="xai-...",
    model="grok-3-fast",
)

VAD Configuration

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={
        "turn_detection_type": "server_vad",    # server_vad (default)
        "threshold": 0.5,
        "silence_duration_ms": 800,
        "prefix_padding_ms": 300,
    },
)

Available Voices

eve, ara, rex, sal, leo

Native Tools

xAI supports native web_search and x_search tools alongside standard function tools:

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    tools=[
        {"type": "web_search"},                # Search the web
        {"type": "x_search"},                  # Search X (Twitter)
        weather_tool,                          # Standard function tool
    ],
)

Input Transcription

# Override transcription model via provider_config
provider_config={
    "transcription_model": "grok-2-audio",
}

Environment Variables (Example)

XAI_API_KEY=xai-... \
    uv run python examples/realtime_voice_local_xai.py

# Optional overrides
XAI_MODEL=grok-3-fast           # Model override
XAI_VOICE=ara                   # Voice override

ElevenLabs Conversational AI

Server-orchestrated speech-to-speech using ElevenLabs agents. STT, LLM, TTS, VAD, and turn-taking are all handled server-side — the provider just sends/receives audio and handles client tool calls.

from __future__ import annotations

from roomkit.providers.elevenlabs.config import ElevenLabsRealtimeConfig
from roomkit.providers.elevenlabs.realtime import ElevenLabsRealtimeProvider

config = ElevenLabsRealtimeConfig(
    api_key="xi-...",
    agent_id="agent_abc123",           # From ElevenLabs dashboard
)
provider = ElevenLabsRealtimeProvider(config)

Agent Setup

ElevenLabs agents are pre-configured on the ElevenLabs dashboard with an LLM, voice, knowledge base, and tools. The agent_id identifies which agent to connect to. Runtime overrides for system prompt, voice, and temperature are applied at connection time.

Configuration Overrides

Override agent defaults via channel parameters. Provider-specific settings (language, first message, dynamic variables) are passed via session metadata["provider_config"]:

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    system_prompt="You are a helpful assistant.",   # Override agent prompt
    voice="voice-id-from-elevenlabs",               # Override agent voice
    temperature=0.7,                                # Override LLM temperature
)

# Provider-specific overrides via session metadata
session = await channel.start_session(
    room_id="room-1",
    participant_id="user-1",
    metadata={
        "provider_config": {
            "language": "fr",                       # Language code
            "first_message": "Bonjour!",            # Agent's opening message
            "dynamic_variables": {                  # Template variables for the prompt
                "user_name": "Alice",
                "account_id": "12345",
            },
        },
    },
)
| Parameter         | Where                         | Description |
| ----------------- | ----------------------------- | ----------- |
| system_prompt     | Channel                       | Override the agent's system prompt |
| voice             | Channel                       | ElevenLabs voice ID (overrides agent default) |
| temperature       | Channel                       | LLM sampling temperature |
| language          | metadata["provider_config"]   | Language code (e.g. en, fr, ja, es) |
| first_message     | metadata["provider_config"]   | Agent greeting message |
| dynamic_variables | metadata["provider_config"]   | Dict of variables for prompt templates |

Authentication

Two authentication modes:

# Direct API key (server-to-server) — default
config = ElevenLabsRealtimeConfig(
    api_key="xi-...",
    agent_id="agent_abc123",
    requires_auth=False,              # API key sent as header
)

# Signed URL (client-facing deployments)
config = ElevenLabsRealtimeConfig(
    api_key="xi-...",
    agent_id="agent_abc123",
    requires_auth=True,               # Fetches signed URL via SDK
)

Regional Endpoints

# EU (GDPR)
config = ElevenLabsRealtimeConfig(
    api_key="xi-...",
    agent_id="agent_abc123",
    base_url="wss://api.eu.residency.elevenlabs.io",
)

Tool Calling

Tools are configured on the ElevenLabs dashboard as client tools. The agent invokes them, and the provider dispatches via the standard on_tool_call callback:

async def handle_tool(name: str, arguments: dict) -> str:
    if name == "check_order":
        return json.dumps({"status": "shipped", "eta": "Tomorrow"})
    return json.dumps({"error": f"Unknown tool: {name}"})


channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    tool_handler=handle_tool,
)

Tools are dashboard-configured

Unlike OpenAI and Gemini, ElevenLabs tool definitions are set on the agent dashboard — not passed at connection time. The tools parameter on RealtimeVoiceChannel is ignored for ElevenLabs. Only the tool_handler callback matters.

Audio Format

ElevenLabs uses 16-bit PCM mono at 16 kHz by default. The format is negotiated at connection time and reported in the server's init metadata.

Supported formats: pcm_8000, pcm_16000, pcm_22050, pcm_24000, pcm_44100, pcm_48000, ulaw_8000.

Environment Variables (Example)

ELEVENLABS_API_KEY=xi-...  ELEVENLABS_AGENT_ID=agent_abc123 \
    uv run python examples/realtime_voice_local_elevenlabs.py

# Optional overrides
ELEVENLABS_VOICE_ID=voice-id    # Voice override
SYSTEM_PROMPT="Be concise."     # System prompt override
LANGUAGE=fr                     # Language code

Audio Transports

Realtime channels need a transport to carry audio between the client and server.

WebSocket Transport

from __future__ import annotations

from roomkit.voice.realtime import WebSocketRealtimeTransport

transport = WebSocketRealtimeTransport(
    authenticate=my_auth_callback,             # Optional auth
    audio_format="base64_json",                # or "binary"
)

Client messages:

{"type": "audio", "data": "<base64 PCM>"}

Server messages:

{"type": "audio", "data": "<base64>"}
{"type": "transcription", "text": "Hello", "role": "user", "is_final": true}
{"type": "speaking", "speaking": true, "who": "assistant"}
{"type": "clear_audio"}

FastRTC WebRTC Transport

Browser-based WebRTC with low latency:

from __future__ import annotations

from roomkit.voice.realtime import FastRTCRealtimeTransport, mount_fastrtc_realtime

transport = FastRTCRealtimeTransport(
    input_sample_rate=16000,
    output_sample_rate=24000,
)

# Mount on FastAPI app
mount_fastrtc_realtime(
    app,
    transport,
    path="/rtc-realtime",
    auth=my_auth_callback,                     # Optional
)

Audio codec: mu-law (8-bit) over WebRTC DataChannel.

Auto-session: Clients connect via WebRTC, transport fires on_client_connected callback.

SIP Transport

Bridge SIP calls to realtime AI:

from __future__ import annotations

from roomkit.voice.realtime import SIPRealtimeTransport

transport = SIPRealtimeTransport(backend=sip_backend)

Sample rate: Negotiated via SIP codec (G.711 @ 8kHz, G.722 @ 16kHz). The channel auto-resamples.

Audio pacing: Built-in OutboundAudioPacer with ~80ms pre-buffer and jitter absorption.

Local Audio (Development)

Use system mic/speakers for testing:

from __future__ import annotations

from roomkit.voice.backends.local import LocalAudioBackend

transport = LocalAudioBackend(
    sample_rate=16000,
    channels=1,
)

Tool Calling

Pass Tool objects directly — the channel extracts definitions and handlers automatically:

from __future__ import annotations

import json

from roomkit import Tool
from roomkit.channels import RealtimeVoiceChannel


async def get_weather(city: str) -> str:
    return json.dumps({"temperature": 72, "condition": "sunny", "city": city})


weather_tool = Tool(
    name="get_weather",
    description="Get current weather for a city",
    parameters={
        "type": "object",
        "properties": {
            "city": {"type": "string", "description": "City name"},
        },
        "required": ["city"],
    },
    handler=get_weather,
)

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    tools=[weather_tool],
    mute_on_tool_call=True,          # Prevent barge-in during tool execution
)

Via tool_handler Callback (Advanced)

For MCP integration, auditing wrappers, or custom dispatch logic, use tool_handler directly:

async def handle_tool(name, arguments):
    if name == "get_weather":
        city = arguments.get("city", "Unknown")
        return json.dumps({"temperature": 72, "condition": "sunny", "city": city})
    return json.dumps({"error": f"Unknown tool: {name}"})


channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    tools=[
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                },
                "required": ["city"],
            },
        },
    ],
    tool_handler=handle_tool,
    mute_on_tool_call=True,
)

Via Hooks

The unified ON_TOOL_CALL hook fires from both AIChannel and RealtimeVoiceChannel. Use event.channel_type to distinguish the source. Return the result via HookResult.metadata["result"].

from __future__ import annotations

from roomkit import HookTrigger, RoomKit
from roomkit.models.hook import HookResult

kit = RoomKit()


@kit.hook(HookTrigger.ON_TOOL_CALL)
async def on_tool_call(event, ctx):
    if event.name == "get_weather":
        result = await fetch_weather(event.arguments["city"])
        return HookResult(action="allow", metadata={"result": result})
    return HookResult.allow()

Mute on Tool Call

Set mute_on_tool_call=True when using Gemini Live — it prevents the user from speaking during tool execution, which can cause Gemini to drop tool results.


Session Lifecycle

from __future__ import annotations

from roomkit.channels import RealtimeVoiceChannel

# Start a session
session = await channel.start_session(
    room_id="room-1",
    participant_id="user-1",
    connection=webrtc_id,                      # Transport-specific connection handle
    metadata={
        "system_prompt": "Override prompt",     # Per-session override
        "voice": "echo",                        # Per-session voice
    },
)

# Reconfigure mid-conversation
await channel.reconfigure_session(
    session.id,
    system_prompt="New prompt",
    voice="shimmer",
    tools=new_tools,
    temperature=0.5,
)

# End session
await channel.end_session(session.id)

State machine: CONNECTING → ACTIVE → ENDED

Auto-resampling: If transport_sample_rate differs from input_sample_rate or output_sample_rate, per-session resamplers are created automatically.


Hooks

Realtime-specific hooks fired during voice sessions:

| Hook                  | Type  | Description |
| --------------------- | ----- | ----------- |
| ON_TRANSCRIPTION      | Sync  | Transcription received (can block/modify) |
| ON_TOOL_CALL          | Sync  | Tool call from any channel (return result via metadata) |
| ON_SPEECH_START       | Async | User started speaking |
| ON_SPEECH_END         | Async | User stopped speaking |
| ON_SESSION_STARTED    | Async | Voice session activated |
| ON_INPUT_AUDIO_LEVEL  | Async | Input audio level (~10/sec) |
| ON_OUTPUT_AUDIO_LEVEL | Async | Output audio level (~10/sec) |
from __future__ import annotations

from roomkit import HookTrigger, RoomKit

kit = RoomKit()


@kit.hook(HookTrigger.ON_TRANSCRIPTION)
async def on_transcription(event, ctx):
    print(f"[{event.metadata['role']}] {event.content.body}")


@kit.hook(HookTrigger.ON_SPEECH_START)
async def on_speech_start(event, ctx):
    print("User started speaking")

Access Control

The channel enforces ChannelBinding permissions (RFC Section 7.5):

  • Access revoked: Audio from the client is silently dropped
  • Muted: Audio from the client is silently dropped
  • Active: Audio flows normally

Permissions are checked on every audio frame — changes take effect immediately.


Complete Example: FastRTC + Gemini

from __future__ import annotations

import json

from fastapi import FastAPI

from roomkit import RoomKit, Tool
from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.realtime import FastRTCRealtimeTransport, mount_fastrtc_realtime
from roomkit.voice.realtime.providers.gemini import GeminiLiveProvider

app = FastAPI()
kit = RoomKit()

provider = GeminiLiveProvider(
    api_key="your-gemini-key",
    model="gemini-2.5-flash-native-audio-preview-12-2025",
)

transport = FastRTCRealtimeTransport(
    input_sample_rate=16000,
    output_sample_rate=24000,
)


async def lookup_order(order_id: str) -> str:
    return json.dumps({"status": "shipped", "eta": "Tomorrow"})


order_tool = Tool(
    name="lookup_order",
    description="Look up an order by ID",
    parameters={
        "type": "object",
        "properties": {"order_id": {"type": "string"}},
        "required": ["order_id"],
    },
    handler=lookup_order,
)

channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    system_prompt="You are a customer service agent. Be helpful and concise.",
    voice="Aoede",
    tools=[order_tool],
    mute_on_tool_call=True,
    emit_transcription_events=True,
)

kit.register_channel(channel)

# Auto-create session on WebRTC connect
transport.on_client_connected = lambda webrtc_id: channel.start_session(
    room_id="room-1",
    participant_id="caller",
    connection=webrtc_id,
)

mount_fastrtc_realtime(app, transport, path="/rtc")

Testing with Mocks

from __future__ import annotations

from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.realtime.mock import MockRealtimeProvider, MockRealtimeTransport

provider = MockRealtimeProvider()
transport = MockRealtimeTransport()

channel = RealtimeVoiceChannel("voice-test", provider=provider, transport=transport)

session = await channel.start_session("room-1", "user-1")

# Simulate provider events
await provider.simulate_transcription(session, "Hello", role="user", is_final=True)
await provider.simulate_audio(session, b"\x00\x01" * 100)
await provider.simulate_tool_call(session, "call-1", "get_weather", {"city": "NYC"})

# Assert transport received audio
assert len(transport.sent_audio) > 0