stream in real time
low-latency playback over a WebSocket session.
for real-time playback, mint a one-shot WebSocket session, connect to it, and send a JSON frame with your synthesis parameters. the server streams raw PCM int16 little-endian @ 24 kHz mono as binary frames, then a terminal {"type":"done"} JSON text frame.
- mint a session
POST /v1/tts/ws-connect returns { ws_url, token }.
- connect and send
connect to
ws_url?token=<token> and send one JSON synthesis frame.
- collect PCM
read binary PCM chunks until the
done (or error) control frame.
import asyncio, json, wave, requests, websockets
# API key sent as a Bearer token when minting the WebSocket session
# (placeholder value; substitute your own key).
API_KEY = "rk_live_•••••••••"
# Base URL of the HTTP API; the ws-connect endpoint path is appended to it.
BASE = "https://silk-api.rumik.ai"
async def main():
    """Stream one utterance over a one-shot WebSocket session into stream.wav.

    Steps: mint a session over HTTP, connect to the returned WebSocket URL,
    send a single JSON synthesis frame, then collect binary PCM chunks until
    the terminal control frame arrives.

    Raises:
        requests.HTTPError: the session-mint request returned an error status.
        RuntimeError: the server sent an error control frame instead of "done".
    """
    # Local import so this snippet stays self-contained.
    from urllib.parse import quote

    # 1. Mint a one-shot WS session -> { ws_url, token }
    resp = requests.post(
        f"{BASE}/v1/tts/ws-connect",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"model": "mulberry", "text": "Streaming in real time."},
    )
    # Fail loudly on 4xx/5xx instead of hitting a KeyError on the body below.
    resp.raise_for_status()
    s = resp.json()

    # 2. Connect, then send the synthesis frame.
    # The token is percent-encoded so characters such as "+" or "=" survive
    # the query string (mirrors encodeURIComponent in the browser sample).
    url = f'{s["ws_url"]}?token={quote(s["token"], safe="")}'
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({
            "text": "Streaming in real time.",
            "description": "calm female narrator",
            "speaker": "speaker_1",  # mulberry only; omit for muga / the described voice
            "f0_up_key": 0,
        }))

        # 3. Collect PCM int16 (24 kHz mono) until the done frame
        pcm = bytearray()
        async for msg in ws:
            if isinstance(msg, bytes):
                pcm.extend(msg)
                continue
            frame = json.loads(msg)  # text frames are JSON control frames
            if frame.get("type") == "done":
                break
            # NOTE(review): the exact error-frame shape is not shown here; both
            # a "type": "error" marker and an "error" key are treated as failure.
            if frame.get("type") == "error" or frame.get("error"):
                raise RuntimeError(f"TTS stream failed: {frame}")

    # Write the collected samples as a 16-bit, 24 kHz, mono WAV file.
    with wave.open("stream.wav", "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(24000)
        w.writeframes(pcm)
asyncio.run(main())
<!doctype html>
<html>
<body>
<button id="play">Speak</button>
<script>
// API key sent as a Bearer token when minting the WebSocket session (placeholder value).
// NOTE(review): a key placed in page script is readable by anyone who loads the
// page — confirm that exposure is acceptable before using a live key here.
const API_KEY = "rk_live_•••••••••"; // use a key with the tts:stream scope
// Base URL of the HTTP API; the ws-connect endpoint path is appended to it.
const BASE = "https://silk-api.rumik.ai";
// Click handler: mints a one-shot WebSocket session, sends one synthesis
// frame, and schedules each incoming PCM chunk for gapless playback.
document.getElementById("play").onclick = async () => {
  // 1. Mint a one-shot WebSocket session -> { ws_url, token }
  const res = await fetch(BASE + "/v1/tts/ws-connect", {
    method: "POST",
    headers: { "Authorization": "Bearer " + API_KEY, "Content-Type": "application/json" },
    body: JSON.stringify({ model: "mulberry", text: "Hello from the browser." }),
  });
  // Surface HTTP failures instead of destructuring an error body below.
  if (!res.ok) {
    throw new Error("ws-connect failed: HTTP " + res.status);
  }
  const { ws_url, token } = await res.json();

  // 2. Set up 24 kHz mono playback
  const ctx = new AudioContext({ sampleRate: 24000 });
  let playAt = ctx.currentTime; // time at which the next chunk should start

  // 3. Connect, send the synthesis frame, queue PCM as it arrives
  const ws = new WebSocket(ws_url + "?token=" + encodeURIComponent(token));
  ws.binaryType = "arraybuffer";
  ws.onopen = () => ws.send(JSON.stringify({
    text: "Hello from the browser.",
    description: "warm, friendly narrator",
    speaker: "speaker_1", // mulberry only; omit for muga / the described voice
    f0_up_key: 0, // pitch shift in semitones (-12..12)
  }));

  ws.onmessage = (e) => {
    if (e.data instanceof ArrayBuffer) {
      // NOTE(review): assumes every binary frame holds whole int16 samples;
      // Int16Array throws a RangeError on an odd byte length.
      const pcm = new Int16Array(e.data);
      const buf = ctx.createBuffer(1, pcm.length, 24000);
      const ch = buf.getChannelData(0);
      // Scale int16 samples into the [-1, 1) float range Web Audio expects.
      for (let i = 0; i < pcm.length; i++) ch[i] = pcm[i] / 32768;
      const src = ctx.createBufferSource();
      src.buffer = buf;
      src.connect(ctx.destination);
      // If playback fell behind, restart from "now" rather than from the past.
      playAt = Math.max(playAt, ctx.currentTime);
      src.start(playAt);
      playAt += buf.duration;
      return;
    }

    // Text frames are JSON control frames; parse once and inspect.
    const frame = JSON.parse(e.data);
    const failed = frame.type === "error" || Boolean(frame.error);
    if (frame.type === "done" || failed) {
      if (failed) console.error("TTS stream error:", frame);
      ws.close();
      // Release the AudioContext once the already-queued audio has finished,
      // so repeated clicks do not accumulate open contexts.
      const remainingMs = Math.max(0, playAt - ctx.currentTime) * 1000;
      setTimeout(() => ctx.close(), remainingMs + 100);
    }
  };
};
</script>
</body>
</html>