stream in real time

low-latency playback over a WebSocket session.

For real-time playback, mint a one-shot WebSocket session, connect to it, and send a JSON frame with your synthesis parameters. The server streams raw PCM (int16, little-endian, 24 kHz, mono) as binary frames, followed by a terminal {"type":"done"} JSON text frame (or an error control frame if synthesis fails).

mint a session
POST /v1/tts/ws-connect returns { ws_url, token }.
connect and send
connect to ws_url?token=<token> (URL-encode the token) and send one JSON synthesis frame.
collect PCM
read binary PCM chunks until the done (or error) control frame.

import asyncio, json, wave, requests, websockets

# API key; sent as a Bearer token in the Authorization header when minting a session.
API_KEY = "rk_live_•••••••••"
# Base URL of the HTTP API; the /v1/tts/ws-connect endpoint is resolved against it.
BASE = "https://silk-api.rumik.ai"

async def main():
    """Stream one utterance over a WebSocket session and save it as stream.wav.

    Mints a one-shot session over HTTPS, connects to the returned WebSocket
    URL with the session token, sends a single JSON synthesis frame, and
    accumulates the binary PCM frames until the terminal control frame.
    The collected audio is written as a 24 kHz, mono, 16-bit WAV file.

    Raises:
        requests.HTTPError: if the session-mint request returns an error status.
        RuntimeError: if the server sends an error control frame.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    # 1. Mint a one-shot WS session -> { ws_url, token }
    resp = requests.post(
        f"{BASE}/v1/tts/ws-connect",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={"model": "mulberry", "text": "Streaming in real time."},
        timeout=30,  # requests never times out by default; don't hang forever
    )
    # Fail here with the HTTP status instead of a KeyError on "ws_url" below.
    resp.raise_for_status()
    s = resp.json()

    # 2. Connect, then send the synthesis frame
    # The token is percent-encoded so characters such as "+" or "=" survive
    # the query string intact.
    url = f'{s["ws_url"]}?token={quote(s["token"], safe="")}'
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({
            "text": "Streaming in real time.",
            "description": "calm female narrator",
            "speaker": "speaker_1",   # mulberry only; omit for muga / the described voice
            "f0_up_key": 0,
        }))

        # 3. Collect PCM int16 (24 kHz mono) until the done frame
        pcm = bytearray()
        async for msg in ws:
            if isinstance(msg, bytes):
                pcm.extend(msg)
                continue

            frame = json.loads(msg)
            if not isinstance(frame, dict):
                continue  # not a control frame we understand
            if frame.get("type") == "done":
                break
            # NOTE(review): the exact error-frame shape is not pinned down here;
            # accept either type == "error" or an "error" field - confirm with the API.
            if frame.get("type") == "error" or "error" in frame:
                raise RuntimeError(f"TTS stream failed: {frame}")

    with wave.open("stream.wav", "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)      # int16 -> 2 bytes per sample
        w.setframerate(24000)
        w.writeframes(pcm)

# Run the example only when executed as a script, so that importing this
# module does not trigger network I/O.
if __name__ == "__main__":
    asyncio.run(main())

<!doctype html>
<html>
  <body>
    <button id="play">Speak</button>
    <script>
      const API_KEY = "rk_live_•••••••••";   // use a key with the tts:stream scope
      const BASE = "https://silk-api.rumik.ai";

      // Click handler: mint a session, open the WebSocket, and play PCM chunks
      // back-to-back through Web Audio as they arrive.
      document.getElementById("play").onclick = async () => {
        const SAMPLE_RATE = 24000;   // stream is PCM int16 little-endian, mono, 24 kHz

        // 1. Set up 24 kHz mono playback.
        // Created synchronously inside the click handler (before any await) so
        // browsers with strict autoplay policies treat it as user-initiated.
        const ctx = new AudioContext({ sampleRate: SAMPLE_RATE });
        let playAt = ctx.currentTime;
        let pending = 0;        // sources scheduled but not yet finished playing
        let finished = false;   // true once the stream has ended (done, error, or close)

        // Close the AudioContext once the stream is over and all queued audio
        // has played; otherwise every click would leak one context.
        const releaseAudio = () => {
          if (finished && pending === 0 && ctx.state !== "closed") ctx.close();
        };
        const finish = () => { finished = true; releaseAudio(); };

        // 2. Mint a one-shot WebSocket session -> { ws_url, token }
        let session;
        try {
          const res = await fetch(BASE + "/v1/tts/ws-connect", {
            method: "POST",
            headers: { "Authorization": "Bearer " + API_KEY, "Content-Type": "application/json" },
            body: JSON.stringify({ model: "mulberry", text: "Hello from the browser." }),
          });
          // Surface HTTP failures here instead of as an undefined ws_url below.
          if (!res.ok) throw new Error("ws-connect failed: HTTP " + res.status);
          session = await res.json();
        } catch (err) {
          finish();
          throw err;
        }
        const { ws_url, token } = session;

        // 3. Connect, send the synthesis frame, queue PCM as it arrives
        const ws = new WebSocket(ws_url + "?token=" + encodeURIComponent(token));
        ws.binaryType = "arraybuffer";

        ws.onopen = () => ws.send(JSON.stringify({
          text: "Hello from the browser.",
          description: "warm, friendly narrator",
          speaker: "speaker_1",   // mulberry only; omit for muga / the described voice
          f0_up_key: 0,           // pitch shift in semitones (-12..12)
        }));

        let carry = null;   // trailing byte of an odd-length chunk, held for the next one

        ws.onmessage = (e) => {
          if (e.data instanceof ArrayBuffer) {
            let bytes = new Uint8Array(e.data);

            // Re-attach a byte left over from the previous chunk.
            if (carry !== null) {
              const joined = new Uint8Array(bytes.length + 1);
              joined[0] = carry;
              joined.set(bytes, 1);
              bytes = joined;
              carry = null;
            }
            // A chunk may split a 2-byte sample; keep the odd byte for next time.
            if (bytes.length % 2 === 1) {
              carry = bytes[bytes.length - 1];
              bytes = bytes.subarray(0, bytes.length - 1);
            }

            const samples = bytes.length / 2;
            if (samples === 0) return;   // createBuffer rejects zero-length buffers

            // Decode explicitly as little-endian rather than relying on host byte order.
            const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
            const buf = ctx.createBuffer(1, samples, SAMPLE_RATE);
            const ch = buf.getChannelData(0);
            for (let i = 0; i < samples; i++) ch[i] = view.getInt16(i * 2, true) / 32768;

            const src = ctx.createBufferSource();
            src.buffer = buf;
            src.connect(ctx.destination);
            src.onended = () => { pending--; releaseAudio(); };
            pending++;

            // Schedule right after the previous chunk, or now if playback fell behind.
            playAt = Math.max(playAt, ctx.currentTime);
            src.start(playAt);
            playAt += buf.duration;
            return;
          }

          // Text frame: a JSON control message. Parse it once.
          const frame = JSON.parse(e.data);
          const failed = frame.type === "error" || Boolean(frame.error);
          if (failed) console.error("TTS stream error:", frame);
          if (frame.type === "done" || failed) ws.close();
        };

        // Fires after ws.close() and after connection errors alike.
        ws.onclose = finish;
      };
    </script>
  </body>
</html>