Lunos

Multimodal Audio

Lunos supports two audio workflows:

  • Audio input: send audio to models for transcription, analysis, or extraction
  • Audio output: request spoken responses from models that support audio output

Endpoint

POST /v1/chat/completions

Authentication

Authorization: Bearer YOUR_SECRET_KEY
Content-Type: application/json

Audio input

Use input_audio in messages[].content[].

Audio data must be base64. Direct audio URLs are not supported in this format.

Content shape:

{
  "type": "input_audio",
  "input_audio": {
    "data": "<BASE64_AUDIO_DATA>",
    "format": "wav"
  }
}

Send audio input

cURL

# Transcribe a local audio file: POST a chat completion whose user message
# combines a text instruction with an `input_audio` content part.
# Replace <BASE64_AUDIO_DATA> with the base64-encoded bytes of the file;
# "format" must match the actual audio encoding (here: wav).
curl -X POST "https://api.lunos.tech/v1/chat/completions" \
  -H "Authorization: Bearer YOUR_SECRET_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "google/gemini-2.5-flash",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Please transcribe this audio file." },
          {
            "type": "input_audio",
            "input_audio": {
              "data": "<BASE64_AUDIO_DATA>",
              "format": "wav"
            }
          }
        ]
      }
    ]
  }'

Python

import base64
import requests

# Read the local audio file and base64-encode it: the API accepts only
# inline base64 audio data in this content shape (no direct audio URLs).
with open("audio.wav", "rb") as f:
    b64_audio = base64.b64encode(f.read()).decode("utf-8")

# Chat-completions payload: one user message mixing a text instruction
# with an `input_audio` content part.
payload = {
    "model": "google/gemini-2.5-flash",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please transcribe this audio file."},
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": b64_audio,
                        # Must match the actual encoding of audio.wav.
                        "format": "wav",
                    },
                },
            ],
        }
    ],
}
response = requests.post(
    "https://api.lunos.tech/v1/chat/completions",
    headers={
        "Authorization": "Bearer YOUR_SECRET_KEY",
        "Content-Type": "application/json",
    },
    json=payload,
    # Without a timeout, requests can block forever on a stalled connection.
    timeout=120,
)
# Raise on HTTP errors instead of silently printing an error body.
response.raise_for_status()
print(response.json())
import fs from "node:fs/promises";

// Read the local audio file and base64-encode it: the API accepts only
// inline base64 audio data in this content shape (no direct audio URLs).
const audioBytes = await fs.readFile("audio.wav");
const base64Audio = audioBytes.toString("base64");

const response = await fetch("https://api.lunos.tech/v1/chat/completions", {
  method: "POST",
  headers: {
    Authorization: "Bearer YOUR_SECRET_KEY",
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    model: "google/gemini-2.5-flash",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "Please transcribe this audio file." },
          {
            type: "input_audio",
            input_audio: {
              data: base64Audio,
              // Must match the actual encoding of audio.wav.
              format: "wav",
            },
          },
        ],
      },
    ],
  }),
});

// fetch() resolves successfully on HTTP error statuses — check explicitly
// so failures are not printed as if they were valid completions.
if (!response.ok) {
  throw new Error(`Request failed: ${response.status} ${await response.text()}`);
}
console.log(await response.json());

Common input formats

Supported formats depend on provider/model. Common values: wav, mp3, aiff, aac, ogg, flac, m4a, pcm16, pcm24.

Audio output

To receive spoken output, set:

  • modalities: ["text", "audio"]
  • audio config (voice, format)
  • stream: true

Request audio output

cURL

# Request spoken output: modalities must include "audio", an audio config
# (voice + format) is required, and stream must be true.
# -N disables curl's output buffering so SSE chunks appear as they arrive.
curl -N -X POST "https://api.lunos.tech/v1/chat/completions" \
  -H "Authorization: Bearer YOUR_SECRET_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-4o-audio-preview",
    "messages": [
      { "role": "user", "content": "Say hello in a friendly tone." }
    ],
    "modalities": ["text", "audio"],
    "audio": {
      "voice": "alloy",
      "format": "wav"
    },
    "stream": true
  }'
import base64
import json
import requests

# Request both text and audio output; streaming is required for audio.
payload = {
    "model": "openai/gpt-4o-audio-preview",
    "messages": [{"role": "user", "content": "Say hello in a friendly tone."}],
    "modalities": ["text", "audio"],
    "audio": {"voice": "alloy", "format": "wav"},
    "stream": True,
}
response = requests.post(
    "https://api.lunos.tech/v1/chat/completions",
    headers={
        "Authorization": "Bearer YOUR_SECRET_KEY",
        "Content-Type": "application/json",
    },
    json=payload,
    stream=True,
    # Without a timeout, requests can block forever on a stalled connection.
    timeout=120,
)
# Fail fast on HTTP errors before attempting to parse the SSE stream.
response.raise_for_status()

audio_chunks = []       # base64 audio fragments, joined then decoded once
transcript_chunks = []  # transcript text fragments

# Parse the server-sent event stream: each "data: ..." line is one JSON chunk.
for line in response.iter_lines():
    if not line:
        continue
    decoded = line.decode("utf-8")
    if not decoded.startswith("data: "):
        continue
    data = decoded[6:]
    if data.strip() == "[DONE]":
        break
    chunk = json.loads(data)
    # Guard against chunks whose "choices" list is empty (e.g. trailing
    # usage-only chunks): indexing [0] directly would raise IndexError,
    # since .get("choices", [{}]) only defaults when the key is absent.
    choices = chunk.get("choices") or []
    if not choices:
        continue
    audio = choices[0].get("delta", {}).get("audio", {})
    if audio.get("data"):
        audio_chunks.append(audio["data"])
    if audio.get("transcript"):
        transcript_chunks.append(audio["transcript"])

transcript = "".join(transcript_chunks)
# Concatenate the base64 fragments first, then decode the payload once.
audio_bytes = base64.b64decode("".join(audio_chunks))
with open("output.wav", "wb") as f:
    f.write(audio_bytes)
print(transcript)

TypeScript

const response = await fetch("https://api.lunos.tech/v1/chat/completions", {
  method: "POST",
  headers: {
    Authorization: "Bearer YOUR_SECRET_KEY",
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    model: "openai/gpt-4o-audio-preview",
    messages: [{ role: "user", content: "Say hello in a friendly tone." }],
    modalities: ["text", "audio"],
    audio: {
      voice: "alloy",
      format: "wav",
    },
    stream: true, // streaming is required for audio output
  }),
});

// fetch() resolves successfully on HTTP error statuses — check explicitly
// so an error body is not parsed as if it were an SSE stream.
if (!response.ok) {
  throw new Error(`Request failed: ${response.status} ${await response.text()}`);
}

const reader = response.body?.getReader();
const decoder = new TextDecoder();
const audioChunks: string[] = [];      // base64 audio fragments
const transcriptChunks: string[] = []; // transcript text fragments

// Fold one SSE "data: ..." line into the accumulators.
function handleLine(line: string): void {
  if (!line.startsWith("data: ")) return;
  const data = line.slice(6).trim();
  if (data === "[DONE]") return;
  try {
    const chunk = JSON.parse(data);
    const audio = chunk?.choices?.[0]?.delta?.audio;
    if (audio?.data) audioChunks.push(audio.data);
    if (audio?.transcript) transcriptChunks.push(audio.transcript);
  } catch {
    // Ignore malformed or partial JSON lines rather than aborting the stream.
  }
}

if (reader) {
  let buffer = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    buffer = lines.pop() ?? ""; // keep the trailing partial line for next read
    for (const line of lines) handleLine(line);
  }
  // Flush any bytes still buffered in the decoder and process the final
  // unterminated line — the previous version silently dropped both when the
  // stream did not end with a newline.
  buffer += decoder.decode();
  if (buffer) handleLine(buffer);
}

const transcript = transcriptChunks.join("");
const fullAudioB64 = audioChunks.join("");
console.log(transcript, fullAudioB64.slice(0, 80));

Streaming chunk format

When requesting audio output, chunks usually contain:

{
  "choices": [
    {
      "delta": {
        "audio": {
          "data": "<base64-audio-chunk>",
          "transcript": "Hello"
        }
      }
    }
  ]
}

Audio output config

Option — Meaning
voice — Voice preset for output speech (model-dependent)
format — Output audio format (for example wav, mp3, flac, opus, pcm16)

Lunos checklist

  • Verify model capability in inputModalities / outputModalities
  • Use base64 for audio input (no direct audio URL)
  • Validate format and duration before sending
  • Use streaming parser for audio output, then reconstruct audio bytes safely