LLM API Playground
A unified Python script to test and compare responses from OpenAI, Anthropic, and Ollama APIs side by side. Perfect for prompt iteration.
Description
Test prompts across multiple LLM providers with a single script. Compare response quality, latency, and token usage. Supports streaming and non-streaming modes.
Requirements
pip install openai anthropic httpx rich
playground.py
#!/usr/bin/env python3
"""
LLM API Playground - Test prompts across multiple providers
Usage: python playground.py "Your prompt here"
"""
import os
import sys
import time
import asyncio
from typing import Optional
from dataclasses import dataclass
# Rich for pretty output
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.markdown import Markdown
console = Console()
# ════════════════════════════════════════════════════════════════════════════
# Configuration
# ════════════════════════════════════════════════════════════════════════════
@dataclass
class ModelConfig:
    """Connection settings for one LLM provider/model pair."""

    provider: str  # display name; also used for routing in query_model
    model: str  # provider-specific model identifier
    api_key_env: str  # name of the environment variable holding the API key
    base_url: Optional[str] = None  # endpoint override for OpenAI-compatible servers (e.g. Ollama)
# Models queried on every run; edit this list to add or remove providers.
MODELS = [
    ModelConfig("OpenAI", "gpt-4o", "OPENAI_API_KEY"),
    ModelConfig("Anthropic", "claude-sonnet-4-20250514", "ANTHROPIC_API_KEY"),
    # Ollama exposes an OpenAI-compatible API locally; no real key is needed.
    ModelConfig("Ollama", "llama3.2", "OLLAMA_API_KEY", "http://localhost:11434/v1"),
]
# ════════════════════════════════════════════════════════════════════════════
# Provider Clients
# ════════════════════════════════════════════════════════════════════════════
async def query_openai(prompt: str, config: ModelConfig) -> dict:
    """Query an OpenAI-compatible chat-completions API.

    Args:
        prompt: User message to send as a single-turn conversation.
        config: Provider settings; ``base_url`` may point at any
            OpenAI-compatible server (e.g. a local Ollama instance).

    Returns:
        dict with keys ``content``, ``tokens_in``, ``tokens_out`` and
        ``latency`` (wall-clock seconds for the API call).
    """
    from openai import AsyncOpenAI

    # Ollama ignores the key but the client requires a non-empty value,
    # hence the "ollama" placeholder when the env var is unset.
    api_key = os.getenv(config.api_key_env, "ollama")
    # async with closes the underlying httpx connection pool; the original
    # leaked it, which triggers unclosed-client warnings on exit.
    async with AsyncOpenAI(api_key=api_key, base_url=config.base_url) as client:
        start = time.perf_counter()
        response = await client.chat.completions.create(
            model=config.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024,
        )
        elapsed = time.perf_counter() - start
    return {
        "content": response.choices[0].message.content,
        "tokens_in": response.usage.prompt_tokens,
        "tokens_out": response.usage.completion_tokens,
        "latency": elapsed,
    }
async def query_anthropic(prompt: str, config: ModelConfig) -> dict:
    """Query the Anthropic Messages API.

    Args:
        prompt: User message to send as a single-turn conversation.
        config: Provider settings; ``api_key_env`` names the env var
            holding the Anthropic API key.

    Returns:
        dict with keys ``content``, ``tokens_in``, ``tokens_out`` and
        ``latency`` (wall-clock seconds for the API call).
    """
    import anthropic

    # async with closes the underlying HTTP connection pool; the original
    # leaked one client per call.
    async with anthropic.AsyncAnthropic(api_key=os.getenv(config.api_key_env)) as client:
        start = time.perf_counter()
        response = await client.messages.create(
            model=config.model,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        elapsed = time.perf_counter() - start
    return {
        # Assumes the first content block is text — holds for plain
        # single-turn text prompts like the ones this script sends.
        "content": response.content[0].text,
        "tokens_in": response.usage.input_tokens,
        "tokens_out": response.usage.output_tokens,
        "latency": elapsed,
    }
# ════════════════════════════════════════════════════════════════════════════
# Main Logic
# ════════════════════════════════════════════════════════════════════════════
async def query_model(prompt: str, config: ModelConfig) -> dict:
    """Dispatch *prompt* to the provider named in *config*.

    Anthropic gets its native client; every other provider goes through
    the OpenAI-compatible path. Any exception is converted into an
    ``{"error": ...}`` dict so one failing provider never aborts the
    whole comparison run.
    """
    handler = query_anthropic if config.provider == "Anthropic" else query_openai
    try:
        return await handler(prompt, config)
    except Exception as exc:  # top-level boundary: report, don't crash
        return {"error": str(exc)}
async def run_comparison(prompt: str):
    """Send *prompt* to every configured model and render a comparison.

    Prints the prompt, one panel per provider (red on error, green with
    latency/token stats on success), then a summary table of the
    successful responses.
    """
    console.print(Panel(prompt, title="[bold cyan]Prompt[/]", border_style="cyan"))
    console.print()

    # Fan out to all providers at once; query_model never raises, it
    # returns an {"error": ...} dict instead.
    results = await asyncio.gather(*(query_model(prompt, cfg) for cfg in MODELS))

    # Per-provider panels.
    for cfg, res in zip(MODELS, results):
        heading = f"{cfg.provider} ({cfg.model})"
        if "error" in res:
            console.print(Panel(
                f"[red]Error: {res['error']}[/]",
                title=f"[bold red]{heading}[/]",
                border_style="red",
            ))
        else:
            stats = (
                f"⏱ {res['latency']:.2f}s | "
                f"📥 {res['tokens_in']} | 📤 {res['tokens_out']} tokens"
            )
            console.print(Panel(
                Markdown(res["content"]),
                title=f"[bold green]{heading}[/]",
                subtitle=stats,
                border_style="green",
            ))
        console.print()

    # Summary table (successful responses only).
    summary = Table(title="Performance Comparison")
    summary.add_column("Provider", style="cyan")
    summary.add_column("Model", style="magenta")
    summary.add_column("Latency", justify="right")
    summary.add_column("Tokens", justify="right")
    for cfg, res in zip(MODELS, results):
        if "error" in res:
            continue
        summary.add_row(
            cfg.provider,
            cfg.model,
            f"{res['latency']:.2f}s",
            f"{res['tokens_in']} → {res['tokens_out']}",
        )
    console.print(summary)
# ════════════════════════════════════════════════════════════════════════════
# Entry Point
# ════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Everything after the script name forms the prompt, joined by spaces.
    args = sys.argv[1:]
    if not args:
        console.print("[yellow]Usage: python playground.py 'Your prompt here'[/]")
        sys.exit(1)
    asyncio.run(run_comparison(" ".join(args)))
Environment Setup
# .env file
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
# For Ollama (no key needed, just run locally)
# ollama serve
Example Output
╭──────────────────── Prompt ────────────────────╮
│ Explain quantum entanglement in one sentence. │
╰────────────────────────────────────────────────╯
╭─ OpenAI (gpt-4o) ──────────────────────────────╮
│ Quantum entanglement is a phenomenon where │
│ two particles become correlated such that the │
│ quantum state of one instantly influences the │
│ other, regardless of distance. │
╰──────────── ⏱ 0.82s | 📥 14 | 📤 38 tokens ────╯
╭─ Anthropic (claude-sonnet-4-20250514) ─────────────────╮
│ Quantum entanglement occurs when particles │
│ become linked so measuring one instantly │
│ determines properties of the other, no matter │
│ how far apart they are. │
╰──────────── ⏱ 1.12s | 📥 15 | 📤 41 tokens ────╯
Tips
- Add more models: Edit the `MODELS` list to include other providers
- Streaming: Add `stream=True` for real-time output
- Temperature: Add a `temperature` parameter to compare creativity
- Save results: Pipe output to file for later analysis