
Audio API

Speech recognition and synthesis API usage guide with multi-language examples

The Audio API provides speech recognition (STT) and speech synthesis (TTS) capabilities: speech-to-text transcription, text-to-speech generation, and speech translation.

Basic Information

API Endpoints

  • Text-to-speech (TTS): https://api.routin.ai/v1/audio/speech
  • Speech-to-text (STT): https://api.routin.ai/v1/audio/transcriptions
  • Speech translation: https://api.routin.ai/v1/audio/translations

Authentication

Add your API Key in the request header:

Authorization: Bearer YOUR_API_KEY

MeteorAI is fully compatible with the OpenAI Audio API, supporting Whisper and TTS models.

1. Text-to-Speech

Convert text to natural speech.

Request Parameters

Parameter       | Type   | Required | Description
model           | string | Yes      | Model name, such as tts-1, tts-1-hd
input           | string | Yes      | Text to convert, up to 4096 characters
voice           | string | Yes      | Voice option: alloy, echo, fable, onyx, nova, shimmer
response_format | string | No       | Audio format: mp3, opus, aac, flac, wav, pcm (default mp3)
speed           | number | No       | Speech speed (0.25-4.0), default 1.0
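
For example, a minimal sketch combining the optional parameters (it assumes a configured client as in the Code Examples below; the parameter values are illustrative):

response = client.audio.speech.create(
    model="tts-1-hd",
    voice="alloy",
    input="Testing the optional parameters.",
    response_format="flac",  # request FLAC instead of the default mp3
    speed=1.25               # 25% faster than the default
)
response.stream_to_file("output.flac")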

Voice Options

  • alloy: Neutral, balanced voice
  • echo: Warm, friendly male voice
  • fable: Expressive voice
  • onyx: Deep, authoritative male voice
  • nova: Lively, friendly female voice
  • shimmer: Gentle, soft female voice
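
To hear the differences, a quick comparison loop (a sketch; it assumes a configured client as in the Code Examples below, and the output filenames are arbitrary):

# Render the same sentence with each voice for comparison
for voice in ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
    sample = client.audio.speech.create(
        model="tts-1",
        voice=voice,
        input="This is a voice sample."
    )
    sample.stream_to_file(f"sample_{voice}.mp3")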

Code Examples

Python:

from openai import OpenAI
from pathlib import Path

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

# Generate speech
response = client.audio.speech.create(
    model="tts-1",
    voice="nova",
    input="你好!欢迎使用 MeteorAI 的语音合成服务。"
)

# Save audio file
speech_file_path = Path("output.mp3")
response.stream_to_file(speech_file_path)
print(f"Audio saved to: {speech_file_path}")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const response = await client.audio.speech.create({
    model: 'tts-1',
    voice: 'nova',
    input: '你好!欢迎使用 MeteorAI 的语音合成服务。',
  });

  const buffer = Buffer.from(await response.arrayBuffer());
  await fs.promises.writeFile('output.mp3', buffer);
  console.log('Audio saved to: output.mp3');
}

main();

Node.js:

const OpenAI = require('openai');
const fs = require('fs');

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const response = await client.audio.speech.create({
    model: 'tts-1',
    voice: 'nova',
    input: '你好!欢迎使用 MeteorAI 的语音合成服务。',
  });

  const buffer = Buffer.from(await response.arrayBuffer());
  await fs.promises.writeFile('output.mp3', buffer);
  console.log('Audio saved to: output.mp3');
}

main();

C#:

using OpenAI.Audio;

var client = new AudioClient(
    model: "tts-1",
    apiKey: "YOUR_API_KEY",
    new OpenAIClientOptions
    {
        Endpoint = new Uri("https://api.routin.ai/v1")
    }
);

var response = await client.GenerateSpeechAsync(
    "你好!欢迎使用 MeteorAI 的语音合成服务。",
    GeneratedSpeechVoice.Nova
);

using var fileStream = File.OpenWrite("output.mp3");
response.Value.ToStream().CopyTo(fileStream);
Console.WriteLine("Audio saved to: output.mp3");

cURL:

curl https://api.routin.ai/v1/audio/speech \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "tts-1",
    "input": "你好!欢迎使用 MeteorAI 的语音合成服务。",
    "voice": "nova"
  }' \
  --output output.mp3

Advanced Usage: Streaming Playback

Python:

from openai import OpenAI
import pygame
from io import BytesIO

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

response = client.audio.speech.create(
    model="tts-1",
    voice="nova",
    input="这是一段较长的文本,我们将实时播放它。",
    response_format="mp3"
)

# Play the generated audio from memory (the full response is buffered before playback starts)
audio_data = BytesIO(response.content)
pygame.mixer.init()
pygame.mixer.music.load(audio_data)
pygame.mixer.music.play()

while pygame.mixer.music.get_busy():
    pygame.time.Clock().tick(10)

TypeScript:

import OpenAI from 'openai';
import { Readable } from 'stream';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const response = await client.audio.speech.create({
    model: 'tts-1',
    voice: 'nova',
    input: '这是一段较长的文本,我们将实时播放它。',
    response_format: 'mp3',
  });

  // Stream the raw MP3 bytes (piped to stdout here only as a demonstration)
  const stream = Readable.from(response.body);
  stream.pipe(process.stdout);
}

main();

2. Speech-to-Text

Convert audio files to text (supports multiple languages).

Request Parameters

Parameter               | Type   | Required | Description
file                    | file   | Yes      | Audio file (< 25MB); supports mp3, mp4, mpeg, mpga, m4a, wav, webm
model                   | string | Yes      | Model name, such as whisper-1
language                | string | No       | ISO-639-1 language code, such as zh, en, ja
prompt                  | string | No       | Optional context or style prompt
response_format         | string | No       | Format: json, text, srt, vtt, verbose_json (default json)
temperature             | number | No       | Sampling temperature (0-1)
timestamp_granularities | array  | No       | Timestamp granularity: word, segment
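
As a quick illustration of the optional parameters, a sketch that requests plain-text output with a domain prompt (the prompt wording is illustrative; it assumes a configured client as in the Code Examples below):

with open("speech.mp3", "rb") as audio_file:
    text = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="text",  # returns a plain string instead of JSON
        prompt="A discussion about machine learning and neural networks.",
        temperature=0
    )
print(text)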

Code Examples

Python:

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

# Transcribe audio
audio_file = open("speech.mp3", "rb")
transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    language="zh"
)

print(f"Transcription result: {transcription.text}")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const transcription = await client.audio.transcriptions.create({
    file: fs.createReadStream('speech.mp3'),
    model: 'whisper-1',
    language: 'zh',
  });

  console.log(`Transcription result: ${transcription.text}`);
}

main();

Node.js:

const OpenAI = require('openai');
const fs = require('fs');

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const transcription = await client.audio.transcriptions.create({
    file: fs.createReadStream('speech.mp3'),
    model: 'whisper-1',
    language: 'zh',
  });

  console.log(`Transcription result: ${transcription.text}`);
}

main();

C#:

using OpenAI.Audio;

var client = new AudioClient(
    model: "whisper-1",
    apiKey: "YOUR_API_KEY",
    new OpenAIClientOptions
    {
        Endpoint = new Uri("https://api.routin.ai/v1")
    }
);

using var audioStream = File.OpenRead("speech.mp3");

var transcription = await client.TranscribeAudioAsync(
    audioStream,
    "speech.mp3",
    new AudioTranscriptionOptions
    {
        Language = "zh"
    }
);

Console.WriteLine($"Transcription result: {transcription.Value.Text}");

cURL:

curl https://api.routin.ai/v1/audio/transcriptions \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -F file="@speech.mp3" \
  -F model="whisper-1" \
  -F language="zh"

Transcription with Timestamps

Python:

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

audio_file = open("speech.mp3", "rb")
transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    response_format="verbose_json",
    timestamp_granularities=["word", "segment"]
)

print(f"Full text: {transcription.text}")
print(f"\nBy segment:")
# segments are returned as objects; access their fields as attributes
for segment in transcription.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const transcription = await client.audio.transcriptions.create({
    file: fs.createReadStream('speech.mp3'),
    model: 'whisper-1',
    response_format: 'verbose_json',
    timestamp_granularities: ['word', 'segment'],
  });

  console.log(`Full text: ${transcription.text}`);
  console.log('\nBy segment:');

  transcription.segments?.forEach((segment: any) => {
    console.log(`[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s] ${segment.text}`);
  });
}

main();

C#:

using OpenAI.Audio;

var client = new AudioClient(
    model: "whisper-1",
    apiKey: "YOUR_API_KEY",
    new OpenAIClientOptions
    {
        Endpoint = new Uri("https://api.routin.ai/v1")
    }
);

using var audioStream = File.OpenRead("speech.mp3");

var transcription = await client.TranscribeAudioAsync(
    audioStream,
    "speech.mp3",
    new AudioTranscriptionOptions
    {
        ResponseFormat = AudioTranscriptionFormat.VerboseJson,
        TimestampGranularities = AudioTimestampGranularities.Word | AudioTimestampGranularities.Segment
    }
);

Console.WriteLine($"Full text: {transcription.Value.Text}");
Console.WriteLine("\nBy segment:");

foreach (var segment in transcription.Value.Segments)
{
    Console.WriteLine($"[{segment.Start:F2}s - {segment.End:F2}s] {segment.Text}");
}

Generate Subtitle Files

Python:

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

audio_file = open("speech.mp3", "rb")

# Generate SRT subtitles
srt_transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    response_format="srt"
)

with open("subtitles.srt", "w", encoding="utf-8") as f:
    f.write(srt_transcription)
print("SRT subtitles saved")

# Generate VTT subtitles
audio_file.seek(0)
vtt_transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    response_format="vtt"
)

with open("subtitles.vtt", "w", encoding="utf-8") as f:
    f.write(vtt_transcription)
print("VTT subtitles saved")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  // Generate SRT subtitles
  const srtTranscription = await client.audio.transcriptions.create({
    file: fs.createReadStream('speech.mp3'),
    model: 'whisper-1',
    response_format: 'srt',
  });

  await fs.promises.writeFile('subtitles.srt', srtTranscription);
  console.log('SRT subtitles saved');

  // Generate VTT subtitles
  const vttTranscription = await client.audio.transcriptions.create({
    file: fs.createReadStream('speech.mp3'),
    model: 'whisper-1',
    response_format: 'vtt',
  });

  await fs.promises.writeFile('subtitles.vtt', vttTranscription);
  console.log('VTT subtitles saved');
}

main();

3. Speech Translation

Translate audio from any language to English text.

Request Parameters

Parameter       | Type   | Required | Description
file            | file   | Yes      | Audio file (< 25MB)
model           | string | Yes      | Model name, such as whisper-1
prompt          | string | No       | Optional context prompt
response_format | string | No       | Format: json, text, srt, vtt, verbose_json
temperature     | number | No       | Sampling temperature (0-1)

The speech translation feature translates audio content to English rather than preserving the original language. To preserve the original language, use the transcription feature.
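
A quick side-by-side sketch of the two endpoints on the same file (assumes a configured client and the chinese_speech.mp3 file used in the examples below):

with open("chinese_speech.mp3", "rb") as f:
    original = client.audio.transcriptions.create(model="whisper-1", file=f, language="zh")

with open("chinese_speech.mp3", "rb") as f:
    english = client.audio.translations.create(model="whisper-1", file=f)

print(f"Transcription (original language): {original.text}")
print(f"Translation (English): {english.text}")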

Code Examples

Python:

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

# Translate audio (e.g., Chinese audio to English)
audio_file = open("chinese_speech.mp3", "rb")
translation = client.audio.translations.create(
    model="whisper-1",
    file=audio_file
)

print(f"Translation result (English): {translation.text}")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  const translation = await client.audio.translations.create({
    file: fs.createReadStream('chinese_speech.mp3'),
    model: 'whisper-1',
  });

  console.log(`Translation result (English): ${translation.text}`);
}

main();

C#:

using OpenAI.Audio;

var client = new AudioClient(
    model: "whisper-1",
    apiKey: "YOUR_API_KEY",
    new OpenAIClientOptions
    {
        Endpoint = new Uri("https://api.routin.ai/v1")
    }
);

using var audioStream = File.OpenRead("chinese_speech.mp3");

var translation = await client.TranslateAudioAsync(
    audioStream,
    "chinese_speech.mp3"
);

Console.WriteLine($"Translation result (English): {translation.Value.Text}");

cURL:

curl https://api.routin.ai/v1/audio/translations \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -F file="@chinese_speech.mp3" \
  -F model="whisper-1"

Supported Languages

The Whisper model supports transcription in multiple languages:

Primarily Supported Languages:

  • Chinese (zh)
  • English (en)
  • Spanish (es)
  • French (fr)
  • German (de)
  • Japanese (ja)
  • Korean (ko)
  • Russian (ru)
  • Portuguese (pt)
  • Italian (it)
  • 90+ languages total

For a complete list of supported languages, refer to the OpenAI Whisper documentation. Note that the translation endpoint only translates into English.
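
If the language parameter is omitted, Whisper detects the language automatically. A sketch that reads the detected language back from a verbose_json response (assumes a configured client):

with open("speech.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json"  # includes a detected-language field
    )

print(f"Detected language: {result.language}")
print(result.text)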

Use Cases

1. Meeting Transcription

# Transcribe meeting recording
meeting_audio = open("meeting.mp3", "rb")
transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=meeting_audio,
    language="zh",
    response_format="verbose_json",
    timestamp_granularities=["segment"]
)

# Generate meeting minutes
for segment in transcription.segments:
    print(f"[{segment.start//60:.0f}:{segment.start%60:05.2f}] {segment.text}")

2. Podcast Subtitle Generation

# Generate multilingual subtitles for podcasts
podcast_audio = open("podcast.mp3", "rb")

# Chinese subtitles
zh_subtitles = client.audio.transcriptions.create(
    model="whisper-1",
    file=podcast_audio,
    language="zh",
    response_format="srt"
)

# English translation subtitles
podcast_audio.seek(0)
en_subtitles = client.audio.translations.create(
    model="whisper-1",
    file=podcast_audio,
    response_format="srt"
)

3. Voice Assistant

# Real-time voice interaction
def voice_assistant(audio_file_path):
    # 1. Speech to text
    audio = open(audio_file_path, "rb")
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio
    )
    user_input = transcription.text

    # 2. AI processing
    chat_response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_input}]
    )
    ai_response = chat_response.choices[0].message.content

    # 3. Text to speech
    speech_response = client.audio.speech.create(
        model="tts-1",
        voice="nova",
        input=ai_response
    )

    return speech_response
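
A possible way to call it, saving the synthesized reply to a file (file names are illustrative):

reply = voice_assistant("user_question.mp3")
reply.stream_to_file("assistant_reply.mp3")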

4. Customer Service Quality Assurance

# Transcribe and analyze customer service calls
call_audio = open("customer_call.mp3", "rb")
transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=call_audio,
    language="zh",
    response_format="verbose_json"
)

# Use AI to analyze call quality
analysis = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": f"分析以下客服通话质量:\n{transcription.text}"
    }]
)

Best Practices

1. Audio Preprocessing

  • Format: Recommend using MP3, M4A, or WAV format
  • Sample Rate: 16kHz or higher
  • Noise Reduction: Pre-remove background noise to improve accuracy
  • Size: Ensure file is smaller than 25MB
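
A preprocessing sketch along these lines, using the same pydub library as the cost-optimization example below (file names are illustrative; pydub requires ffmpeg to be installed):

import os
from pydub import AudioSegment

# Convert an arbitrary recording to 16 kHz mono MP3 before upload
audio = AudioSegment.from_file("raw_recording.m4a")
audio = audio.set_frame_rate(16000).set_channels(1)
audio.export("prepared.mp3", format="mp3", bitrate="64k")

size_mb = os.path.getsize("prepared.mp3") / (1024 * 1024)
print(f"Prepared file: {size_mb:.1f} MB")  # should be well under the 25MB limit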

2. Improve Transcription Accuracy

# Use prompts to provide context
transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    language="zh",
    prompt="这是一段关于人工智能技术的讨论,涉及机器学习、深度学习等专业术语。"
)

3. Cost Optimization

# For long audio, process in segments
from pydub import AudioSegment

audio = AudioSegment.from_mp3("long_audio.mp3")
chunk_length_ms = 10 * 60 * 1000  # 10 minutes

chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

full_transcription = ""
for i, chunk in enumerate(chunks):
    chunk.export(f"chunk_{i}.mp3", format="mp3")
    with open(f"chunk_{i}.mp3", "rb") as f:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=f
        )
        full_transcription += transcription.text + " "

4. Speech Quality

# Choose appropriate TTS model
# tts-1: Faster, lower latency, suitable for real-time scenarios
# tts-1-hd: Better quality, suitable for high-quality content production

# Real-time scenario
client.audio.speech.create(
    model="tts-1",
    voice="nova",
    input=text
)

# High-quality production
client.audio.speech.create(
    model="tts-1-hd",
    voice="nova",
    input=text,
    speed=1.0
)

Error Handling

Audio file size is limited to 25MB. Files exceeding this limit need to be processed in segments.

Python:

from openai import OpenAI, APIError
import os

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.routin.ai/v1"
)

try:
    # Check file size before uploading
    file_size = os.path.getsize("speech.mp3")
    if file_size > 25 * 1024 * 1024:
        raise ValueError("File size exceeds 25MB limit")

    with open("speech.mp3", "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    print(transcription.text)

except ValueError as e:
    print(f"File error: {e}")
except APIError as e:
    print(f"API error: {e}")
except Exception as e:
    print(f"Unknown error: {e}")

TypeScript:

import OpenAI from 'openai';
import fs from 'fs';

const client = new OpenAI({
  apiKey: 'YOUR_API_KEY',
  baseURL: 'https://api.routin.ai/v1',
});

async function main() {
  try {
    const stats = fs.statSync('speech.mp3');
    if (stats.size > 25 * 1024 * 1024) {
      throw new Error('File size exceeds 25MB limit');
    }

    const transcription = await client.audio.transcriptions.create({
      file: fs.createReadStream('speech.mp3'),
      model: 'whisper-1',
    });

    console.log(transcription.text);
  } catch (error: any) {
    if (error instanceof OpenAI.APIError) {
      console.error(`API error: ${error.message}`);
    } else {
      console.error(`Error: ${error.message}`);
    }
  }
}

main();

C#:

using OpenAI.Audio;
using OpenAI;
using System.ClientModel;  // for ClientResultException

var client = new AudioClient(
    model: "whisper-1",
    apiKey: "YOUR_API_KEY",
    new OpenAIClientOptions
    {
        Endpoint = new Uri("https://api.routin.ai/v1")
    }
);

try
{
    var fileInfo = new FileInfo("speech.mp3");
    if (fileInfo.Length > 25 * 1024 * 1024)
    {
        throw new InvalidOperationException("File size exceeds 25MB limit");
    }

    using var audioStream = File.OpenRead("speech.mp3");
    var transcription = await client.TranscribeAudioAsync(
        audioStream,
        "speech.mp3"
    );

    Console.WriteLine(transcription.Value.Text);
}
catch (ClientResultException ex)
{
    Console.WriteLine($"API error: {ex.Message}");
}
catch (Exception ex)
{
    Console.WriteLine($"Error: {ex.Message}");
}

Common Error Codes

Error Code | Description                 | Solution
401        | Invalid API Key             | Check the Authorization header
400        | Invalid file format or size | Check audio format and file size (< 25MB)
429        | Rate limit exceeded         | Reduce request frequency
500        | Server error                | Retry later
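
For 429 errors in particular, a simple retry-with-backoff sketch (retry counts and delays are illustrative):

import time
from openai import OpenAI, RateLimitError

client = OpenAI(api_key="YOUR_API_KEY", base_url="https://api.routin.ai/v1")

for attempt in range(5):
    try:
        with open("speech.mp3", "rb") as f:
            transcription = client.audio.transcriptions.create(model="whisper-1", file=f)
        print(transcription.text)
        break
    except RateLimitError:
        wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
        print(f"Rate limited, retrying in {wait}s...")
        time.sleep(wait)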

Supported Audio Formats

Transcription and Translation:

  • MP3
  • MP4
  • MPEG
  • MPGA
  • M4A
  • WAV
  • WEBM

Text-to-Speech Output:

  • MP3 (default)
  • OPUS
  • AAC
  • FLAC
  • WAV
  • PCM
