Skip to content

Realtime Voice (Speech-to-Speech)

RoomKit's RealtimeVoiceChannel enables speech-to-speech AI conversations using providers like OpenAI Realtime API and Google Gemini Live. Audio flows directly between the client and the AI provider — no separate STT/TTS stages.

How It Differs from VoiceChannel

| Aspect | VoiceChannel | RealtimeVoiceChannel |
| --- | --- | --- |
| Audio flow | Client → STT → AI → TTS → Client | Client ↔ Provider (direct) |
| VAD | Client-side (AudioPipeline) | Server-side (provider) |
| Transcription | Generated by STT provider | Provided by realtime provider |
| Tool calling | Via AIChannel + hooks | Direct provider callbacks |
| Latency | Higher (STT + AI + TTS) | Lower (single round-trip) |

Quick Start

from __future__ import annotations

from roomkit import RoomKit
from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.backends.mock import MockVoiceBackend
from roomkit.voice.realtime.providers.openai import OpenAIRealtimeProvider

# Speech-to-speech provider: audio flows directly between client and the
# OpenAI Realtime API — no separate STT/TTS stages.
provider = OpenAIRealtimeProvider(
    api_key="sk-...",
    model="gpt-4o-realtime-preview",
)

# Mock transport for the example — use a real VoiceBackend in production.
transport = MockVoiceBackend()

channel = RealtimeVoiceChannel(
    "voice-realtime",
    provider=provider,
    transport=transport,
    system_prompt="You are a helpful voice assistant.",
    voice="alloy",
    input_sample_rate=16000,   # client -> provider rate (Hz)
    output_sample_rate=24000,  # provider -> client rate (Hz)
    emit_transcription_events=True,  # surface transcriptions as RoomEvents
)

kit = RoomKit()
kit.register_channel(channel)

RealtimeVoiceChannel Configuration

# Full constructor reference; see the parameter table below for defaults.
channel = RealtimeVoiceChannel(
    channel_id="voice-realtime",
    provider=provider,                    # RealtimeVoiceProvider
    transport=transport,                  # VoiceBackend
    system_prompt="...",                  # AI instructions
    voice="alloy",                        # Provider voice preset
    tools=[...],                          # Tool definitions (JSON schema)
    temperature=0.7,                      # Generation temperature
    input_sample_rate=16000,              # Audio from client to provider
    output_sample_rate=24000,             # Audio from provider to client
    transport_sample_rate=None,           # Transport rate (auto-resamples if different)
    emit_transcription_events=True,       # Emit transcriptions as RoomEvents
    tool_handler=my_tool_handler,         # Tool execution callback
    mute_on_tool_call=False,              # Mute mic during tool execution
    tool_result_max_length=16384,         # Truncate large tool results
)
| Parameter | Default | Description |
| --- | --- | --- |
| provider | required | Realtime AI provider (OpenAI, Gemini) |
| transport | required | Audio transport backend |
| system_prompt | None | AI system instructions |
| voice | None | Voice preset name |
| tools | None | Tool definitions for function calling |
| temperature | None | Sampling temperature |
| input_sample_rate | 16000 | Client → provider sample rate |
| output_sample_rate | 24000 | Provider → client sample rate |
| transport_sample_rate | None | Transport rate; auto-resamples if mismatched |
| emit_transcription_events | True | Create RoomEvents from transcriptions |
| tool_handler | None | async (session, name, args) -> result |
| mute_on_tool_call | False | Mute mic during tool execution |
| tool_result_max_length | 16384 | Max chars for tool results |

OpenAI Realtime API

WebSocket-based speech-to-speech with server-side VAD.

from __future__ import annotations

from roomkit.voice.realtime.providers.openai import OpenAIRealtimeProvider

# WebSocket-based speech-to-speech provider with server-side VAD.
provider = OpenAIRealtimeProvider(
    api_key="sk-...",
    model="gpt-4o-realtime-preview",
    base_url=None,                         # Custom endpoint (optional)
)

VAD Configuration

Pass VAD settings via provider_config in the channel or session:

# Server VAD (default)
# Server VAD (default): the provider detects turn boundaries from audio.
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={
        "turn_detection": {
            "type": "server_vad",
            "threshold": 0.5,              # speech-detection threshold (provider-defined scale)
            "silence_duration_ms": 800,    # silence that ends a turn
            "prefix_padding_ms": 300,      # audio retained before detected speech
            "interrupt_response": True,    # user speech interrupts the assistant
            "create_response": True,       # auto-respond at end of turn
        },
    },
)

# Semantic VAD (understands conversation flow)
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={
        "turn_detection": {
            "type": "semantic_vad",
            "eagerness": "high",           # low, medium, high, auto
        },
    },
)

# Manual turn management (no automatic VAD)
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    provider_config={"turn_detection": None},
)

Available Voices

alloy, echo, shimmer, breeze, cinnamon, juniper, sage (varies by model)

Input Transcription

# Configure which model transcribes user input
provider_config={
    "stt_model": "gpt-4o-transcribe",
}

Google Gemini Live

Persistent streaming connection with session resumption and advanced features.

from __future__ import annotations

from roomkit.voice.realtime.providers.gemini import GeminiLiveProvider

# Persistent streaming provider with session resumption — conversation
# context survives reconfiguration (see "Session Resumption" below).
provider = GeminiLiveProvider(
    api_key="your-gemini-key",
    model="gemini-2.5-flash-native-audio-preview-12-2025",
)

Advanced Configuration

Gemini supports several unique features via provider_config:

# Gemini-only options are passed through provider_config as plain keys.
channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    voice="Aoede",
    provider_config={
        # VAD sensitivity
        "start_of_speech_sensitivity": "HIGH",     # LOW, MEDIUM, HIGH
        "end_of_speech_sensitivity": "MEDIUM",
        "silence_duration_ms": 500,

        # Proactive audio (AI speaks without prompt)
        "proactive_audio": True,

        # Affective dialog (emotional responses)
        "enable_affective_dialog": True,

        # Extended thinking
        "thinking_budget": 1024,

        # Generation parameters
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": 2048,

        # Non-interruptible mode (prevents user barge-in during responses)
        "activity_handling": "NO_INTERRUPTION",

        # Language
        "language_code": "en-US",
    },
)
| Feature | Description |
| --- | --- |
| Proactive audio | AI can initiate speech without user prompt |
| Affective dialog | Emotional, expressive responses |
| Thinking budget | Extended reasoning before responding |
| Session resumption | Preserves context across reconfiguration |
| Non-interruptible | Prevent user barge-in during responses |

Available Voices

Aoede, Fenrir, Kore, Pax, Breeze, Charon, Ember, Orion, Stella, and more.

Session Resumption

Gemini preserves conversation context when reconfigured — useful for agent handoff:

# Start with general assistant
# Start with general assistant
session = await channel.start_session(room_id, participant_id)

# Hand off to specialist — Gemini preserves conversation context
await channel.reconfigure_session(
    session.id,
    system_prompt="You are a billing specialist.",  # replaces the prior prompt
    voice="Kore",
    tools=billing_tools,
)

Audio Transports

Realtime channels need a transport to carry audio between the client and server.

WebSocket Transport

from __future__ import annotations

from roomkit.voice.realtime import WebSocketRealtimeTransport

# Carries audio between client and server over a WebSocket; message
# shapes are shown below.
transport = WebSocketRealtimeTransport(
    authenticate=my_auth_callback,             # Optional auth
    audio_format="base64_json",                # or "binary"
)

Client messages:

{"type": "audio", "data": "<base64 PCM>"}

Server messages:

{"type": "audio", "data": "<base64>"}
{"type": "transcription", "text": "Hello", "role": "user", "is_final": true}
{"type": "speaking", "speaking": true, "who": "assistant"}
{"type": "clear_audio"}

FastRTC WebRTC Transport

Browser-based WebRTC with low latency:

from __future__ import annotations

from roomkit.voice.realtime import FastRTCRealtimeTransport, mount_fastrtc_realtime

# Browser-based WebRTC transport (low latency).
transport = FastRTCRealtimeTransport(
    input_sample_rate=16000,
    output_sample_rate=24000,
)

# Mount on FastAPI app
mount_fastrtc_realtime(
    app,
    transport,
    path="/rtc-realtime",                      # route the client connects to
    auth=my_auth_callback,                     # Optional
)

Audio codec: mu-law (8-bit) over WebRTC DataChannel.

Auto-session: Clients connect via WebRTC, transport fires on_client_connected callback.

SIP Transport

Bridge SIP calls to realtime AI:

from __future__ import annotations

from roomkit.voice.realtime import SIPRealtimeTransport

transport = SIPRealtimeTransport(backend=sip_backend)

Sample rate: Negotiated via SIP codec (G.711 @ 8kHz, G.722 @ 16kHz). The channel auto-resamples.

Audio pacing: Built-in OutboundAudioPacer with ~80ms pre-buffer and jitter absorption.

Local Audio (Development)

Use system mic/speakers for testing:

from __future__ import annotations

from roomkit.voice.backends.local import LocalAudioBackend

# System mic/speakers — development/testing only.
transport = LocalAudioBackend(
    sample_rate=16000,
    channels=1,
)

Tool Calling

Via tool_handler Callback

from __future__ import annotations

from roomkit.channels import RealtimeVoiceChannel


async def handle_tool(session, name, arguments):
    """Dispatch a provider tool call to a canned demo response.

    Returns a result dict for known tools; unknown names produce an
    ``{"error": ...}`` payload instead of raising.
    """
    if name == "search_knowledge":
        return {"results": ["Article 1", "Article 2"]}
    if name == "get_weather":
        requested_city = arguments.get("city", "Unknown")
        return {
            "temperature": 72,
            "condition": "sunny",
            "city": requested_city,
        }
    return {"error": f"Unknown tool: {name}"}


channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    tools=[
        # JSON-schema tool definition, passed through to the provider.
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                },
                "required": ["city"],
            },
        },
    ],
    tool_handler=handle_tool,        # async (session, name, args) -> result
    mute_on_tool_call=True,          # Prevent barge-in during tool execution
)

Via Hooks

from __future__ import annotations

from roomkit import HookTrigger, RoomKit

kit = RoomKit()


# Sync hook: store the tool result in event.metadata["result"] so the
# provider receives it.
@kit.hook(HookTrigger.ON_REALTIME_TOOL_CALL)
async def on_tool_call(event, ctx):
    if event.metadata["name"] == "get_weather":
        result = await fetch_weather(event.metadata["arguments"]["city"])
        event.metadata["result"] = result

Mute on Tool Call

Set mute_on_tool_call=True when using Gemini Live — it prevents the user from speaking during tool execution, which can cause Gemini to drop tool results.


Session Lifecycle

from __future__ import annotations

from roomkit.channels import RealtimeVoiceChannel

# Start a session
# Start a session (state: CONNECTING -> ACTIVE)
session = await channel.start_session(
    room_id="room-1",
    participant_id="user-1",
    connection=webrtc_id,                      # Transport-specific connection handle
    metadata={
        "system_prompt": "Override prompt",     # Per-session override
        "voice": "echo",                        # Per-session voice
    },
)

# Reconfigure mid-conversation (prompt, voice, tools, temperature)
await channel.reconfigure_session(
    session.id,
    system_prompt="New prompt",
    voice="shimmer",
    tools=new_tools,
    temperature=0.5,
)

# End session (state -> ENDED)
await channel.end_session(session.id)

State machine: CONNECTING → ACTIVE → ENDED

Auto-resampling: If transport_sample_rate differs from input_sample_rate or output_sample_rate, per-session resamplers are created automatically.


Hooks

Realtime-specific hooks fired during voice sessions:

| Hook | Type | Description |
| --- | --- | --- |
| ON_TRANSCRIPTION | Sync | Transcription received (can block/modify) |
| ON_REALTIME_TOOL_CALL | Sync | Tool call from provider (must return result) |
| ON_SPEECH_START | Async | User started speaking |
| ON_SPEECH_END | Async | User stopped speaking |
| ON_SESSION_STARTED | Async | Voice session activated |
| ON_INPUT_AUDIO_LEVEL | Async | Input audio level (~10/sec) |
| ON_OUTPUT_AUDIO_LEVEL | Async | Output audio level (~10/sec) |
from __future__ import annotations

from roomkit import HookTrigger, RoomKit

kit = RoomKit()


# Fires for each transcription event from the provider.
@kit.hook(HookTrigger.ON_TRANSCRIPTION)
async def on_transcription(event, ctx):
    print(f"[{event.metadata['role']}] {event.content.body}")


# Fires when the user starts speaking.
@kit.hook(HookTrigger.ON_SPEECH_START)
async def on_speech_start(event, ctx):
    print("User started speaking")

Access Control

The channel enforces ChannelBinding permissions (RFC Section 7.5):

  • Access revoked: Audio from the client is silently dropped
  • Muted: Audio from the client is silently dropped
  • Active: Audio flows normally

Permissions are checked on every audio frame — changes take effect immediately.


Complete Example: FastRTC + Gemini

from __future__ import annotations

from fastapi import FastAPI

from roomkit import RoomKit
from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.realtime import FastRTCRealtimeTransport, mount_fastrtc_realtime
from roomkit.voice.realtime.providers.gemini import GeminiLiveProvider

app = FastAPI()
kit = RoomKit()

provider = GeminiLiveProvider(
    api_key="your-gemini-key",
    model="gemini-2.5-flash-native-audio-preview-12-2025",
)

transport = FastRTCRealtimeTransport(
    input_sample_rate=16000,
    output_sample_rate=24000,
)


# Tool callback: return a result dict; unknown tools return an error payload.
async def on_tool(session, name, arguments):
    if name == "lookup_order":
        return {"status": "shipped", "eta": "Tomorrow"}
    return {"error": "Unknown tool"}


channel = RealtimeVoiceChannel(
    "voice",
    provider=provider,
    transport=transport,
    system_prompt="You are a customer service agent. Be helpful and concise.",
    voice="Aoede",
    tools=[
        # JSON-schema tool definition sent to Gemini.
        {
            "type": "function",
            "name": "lookup_order",
            "description": "Look up an order by ID",
            "parameters": {
                "type": "object",
                "properties": {"order_id": {"type": "string"}},
                "required": ["order_id"],
            },
        },
    ],
    tool_handler=on_tool,
    mute_on_tool_call=True,      # recommended with Gemini during tool execution
    emit_transcription_events=True,
)

kit.register_channel(channel)

# Auto-create session on WebRTC connect.
# NOTE(review): the lambda returns a coroutine; the transport is expected
# to await it — confirm against FastRTCRealtimeTransport's callback contract.
transport.on_client_connected = lambda webrtc_id: channel.start_session(
    room_id="room-1",
    participant_id="caller",
    connection=webrtc_id,
)

mount_fastrtc_realtime(app, transport, path="/rtc")

Testing with Mocks

from __future__ import annotations

from roomkit.channels import RealtimeVoiceChannel
from roomkit.voice.realtime.mock import MockRealtimeProvider, MockRealtimeTransport

# In-memory fakes — no network, no real provider.
provider = MockRealtimeProvider()
transport = MockRealtimeTransport()

channel = RealtimeVoiceChannel("voice-test", provider=provider, transport=transport)

session = await channel.start_session("room-1", "user-1")

# Simulate provider events
await provider.simulate_transcription(session, "Hello", role="user", is_final=True)
await provider.simulate_audio(session, b"\x00\x01" * 100)
await provider.simulate_tool_call(session, "call-1", "get_weather", {"city": "NYC"})

# Assert transport received audio
assert len(transport.sent_audio) > 0