Token Counter
Count tokens for any text using multiple tokenizers. Supports OpenAI (tiktoken), Llama, Mistral, and Claude. Essential for prompt engineering.
Description
Accurately count tokens before sending prompts to LLMs. Avoid truncation and optimize context usage. Supports all major tokenizers.
Requirements
pip install tiktoken transformers rich
token_counter.py
#!/usr/bin/env python3
"""
Token Counter - Multi-tokenizer token counting
Usage: python token_counter.py "Your text here"
python token_counter.py -f file.txt
echo "text" | python token_counter.py
"""
import argparse
import sys
from typing import Dict, Optional

from rich.console import Console
from rich.panel import Panel
from rich.table import Table
console = Console()
# ════════════════════════════════════════════════════════════════════════════
# Tokenizers
# ════════════════════════════════════════════════════════════════════════════
def count_tiktoken(text: str, encoding: str = "cl100k_base") -> Dict:
    """Tokenize *text* with an OpenAI tiktoken encoding.

    Returns a dict with the token count, the first ten token ids as a
    preview, and the encoding name that was used.
    """
    import tiktoken

    encoder = tiktoken.get_encoding(encoding)
    token_ids = encoder.encode(text)
    return {
        "count": len(token_ids),
        "tokens": token_ids[:10],  # preview only
        "encoding": encoding,
    }
def count_gpt4(text: str) -> Dict:
    """Token count for GPT-4 / GPT-4o via the cl100k_base encoding."""
    return count_tiktoken(text, encoding="cl100k_base")
def count_gpt35(text: str) -> Dict:
    """Token count for GPT-3.5 (shares the cl100k_base encoding)."""
    return count_tiktoken(text, encoding="cl100k_base")
def count_llama(text: str) -> Dict:
    """Count tokens with the Llama-2 tokenizer from HuggingFace Hub.

    Returns {"count", "tokens", "encoding"} on success, or {"error": msg}
    when the tokenizer cannot be loaded (e.g. no network, or the gated
    meta-llama repo requires authentication).
    """
    try:
        from transformers import AutoTokenizer

        # NOTE: trust_remote_code was removed — the Llama-2 tokenizer is a
        # standard transformers class, and enabling it would execute
        # arbitrary code downloaded from the Hub.
        tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Llama-2-7b-hf",
            use_fast=True,
        )
        tokens = tokenizer.encode(text)
        return {
            "count": len(tokens),
            "tokens": tokens[:10],  # preview only
            "encoding": "llama-2",
        }
    except Exception as e:
        # Broad catch is deliberate: hub download / auth / import failures
        # all surface here and are reported to the UI as an error row.
        return {"error": str(e)}
def count_mistral(text: str) -> Dict:
    """Count tokens with the Mistral-7B tokenizer from HuggingFace Hub.

    On any failure (no network, missing transformers, etc.) returns
    {"error": msg} instead of raising.
    """
    try:
        from transformers import AutoTokenizer
    except Exception as e:
        return {"error": str(e)}
    try:
        tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=True)
        ids = tok.encode(text)
    except Exception as e:
        return {"error": str(e)}
    return {"count": len(ids), "tokens": ids[:10], "encoding": "mistral"}
def count_claude(text: str) -> Dict:
    """Rough token estimate for Claude.

    Anthropic does not ship a public tokenizer here, so the GPT-4
    cl100k_base encoding is used as an approximation.
    """
    estimate = count_tiktoken(text, "cl100k_base")
    estimate["encoding"] = "claude (estimated)"
    return estimate
# ════════════════════════════════════════════════════════════════════════════
# Token Breakdown
# ════════════════════════════════════════════════════════════════════════════
def analyze_text(text: str) -> Dict:
    """Return basic size statistics for *text*.

    Note: ``avg_word_length`` divides the TOTAL character count (including
    whitespace and newlines) by the word count, so it slightly overstates
    the true average word length; 0 when there are no words.
    """
    word_list = text.split()
    char_total = len(text)
    line_count = len(text.split("\n"))
    avg_len = char_total / len(word_list) if word_list else 0
    return {
        "characters": char_total,
        "words": len(word_list),
        "lines": line_count,
        "avg_word_length": avg_len,
    }
# ════════════════════════════════════════════════════════════════════════════
# Cost Estimation
# ════════════════════════════════════════════════════════════════════════════
# Prices per 1M tokens (input) as of 2024
# USD per 1,000,000 INPUT tokens only; output-token pricing is not modeled.
PRICING = {
    "gpt-4o": 2.50,
    "gpt-4o-mini": 0.15,
    "gpt-4-turbo": 10.00,
    "claude-3-opus": 15.00,
    "claude-3-sonnet": 3.00,
    "claude-3-haiku": 0.25,
}
def estimate_cost(tokens: int, pricing: Optional[Dict[str, float]] = None) -> Dict[str, float]:
    """Estimate input cost in USD for *tokens* across models.

    Args:
        tokens: number of input tokens.
        pricing: optional ``{model: USD per 1M input tokens}`` table;
            defaults to the module-level ``PRICING`` constant. (Added as a
            backward-compatible parameter so alternate price tables can be
            supplied and the function can be tested in isolation.)

    Returns:
        ``{model: estimated_cost_usd}`` for every model in the table.
    """
    rate_table = PRICING if pricing is None else pricing
    return {
        model: (tokens / 1_000_000) * rate
        for model, rate in rate_table.items()
    }
# ════════════════════════════════════════════════════════════════════════════
# Display
# ════════════════════════════════════════════════════════════════════════════
def display_results(text: str):
    """Render a preview, size statistics, per-tokenizer counts, and an
    estimated-cost table to the shared rich console."""
    # Text preview — truncate long input so the panel stays readable.
    preview = text[:200] + "..." if len(text) > 200 else text
    console.print(Panel(preview, title="[bold cyan]Text Preview[/]", border_style="cyan"))
    console.print()

    # Basic size statistics.
    analysis = analyze_text(text)
    console.print(f"[dim]Characters:[/] {analysis['characters']:,}")
    console.print(f"[dim]Words:[/] {analysis['words']:,}")
    console.print(f"[dim]Lines:[/] {analysis['lines']:,}")
    console.print()

    # Token counts per tokenizer. The HuggingFace-backed counters
    # (Llama/Mistral) already report their own failures by returning
    # {"error": ...}, so no try/except is needed when listing them — the
    # original wrapped the list appends in try/except, but appending a
    # function reference can never raise, so that guard was dead code.
    table = Table(title="Token Counts by Model")
    table.add_column("Model / Tokenizer", style="cyan")
    table.add_column("Tokens", justify="right", style="green")
    table.add_column("Encoding", style="dim")
    tokenizers = [
        ("GPT-4 / GPT-4o", count_gpt4),
        ("Claude 3", count_claude),
        ("Llama 2", count_llama),
        ("Mistral", count_mistral),
    ]
    token_counts = {}
    for name, func in tokenizers:
        result = func(text)
        if "error" in result:
            table.add_row(name, "[red]Error[/]", result.get("error", "")[:30])
        else:
            table.add_row(name, f"{result['count']:,}", result['encoding'])
            token_counts[name] = result['count']
    console.print(table)
    console.print()

    # Cost estimation keyed off the GPT-4 count; falls back to whichever
    # tokenizer succeeded first if GPT-4 is unavailable.
    if token_counts:
        gpt4_tokens = token_counts.get("GPT-4 / GPT-4o", next(iter(token_counts.values())))
        costs = estimate_cost(gpt4_tokens)
        cost_table = Table(title="Estimated Input Cost")
        cost_table.add_column("Model", style="cyan")
        cost_table.add_column("Cost", justify="right", style="yellow")
        for model, cost in sorted(costs.items(), key=lambda x: x[1]):
            # More decimal places for sub-cent amounts so they don't show as $0.
            cost_str = f"${cost:.6f}" if cost < 0.01 else f"${cost:.4f}"
            cost_table.add_row(model, cost_str)
        console.print(cost_table)
# ════════════════════════════════════════════════════════════════════════════
# Main
# ════════════════════════════════════════════════════════════════════════════
def main():
    """Parse CLI arguments, gather the input text, and display the analysis.

    Input priority: ``-f FILE`` > positional text arguments > piped stdin.
    Exits with status 1 when no input is available or the file is unreadable.
    """
    parser = argparse.ArgumentParser(description="Count tokens for LLM input")
    parser.add_argument("text", nargs="*", help="Text to tokenize")
    parser.add_argument("-f", "--file", help="Read from file")
    args = parser.parse_args()

    if args.file:
        try:
            # Explicit UTF-8: the original relied on the platform default
            # encoding, which varies (e.g. cp1252 on Windows).
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except OSError as e:
            # Report a clean error instead of an unhandled traceback.
            console.print(f"[red]Could not read {args.file}: {e}[/]")
            sys.exit(1)
    elif args.text:
        text = " ".join(args.text)
    elif not sys.stdin.isatty():
        # Non-interactive stdin means text was piped in.
        text = sys.stdin.read()
    else:
        console.print("[yellow]Usage: python token_counter.py 'text' or -f file.txt[/]")
        sys.exit(1)

    display_results(text)


if __name__ == "__main__":
    main()
Usage
# Count tokens in a string
python token_counter.py "Hello, how are you today?"
# Count tokens in a file
python token_counter.py -f my_prompt.txt
# Pipe from another command
cat document.md | python token_counter.py
# Count your system prompt
python token_counter.py -f system_prompt.md
Example Output
╭───────────────── Text Preview ─────────────────╮
│ Hello, how are you today? I'm working on a │
│ project that involves natural language... │
╰────────────────────────────────────────────────╯
Characters: 1,247
Words: 203
Lines: 12
Token Counts by Model
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Model ┃ Tokens ┃ Encoding ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ GPT-4 / GPT-4o │ 284 │ cl100k_base │
│ Claude 3 │ 284 │ claude (est.) │
│ Llama 2 │ 291 │ llama-2 │
│ Mistral │ 288 │ mistral │
└──────────────────┴────────┴───────────────┘
Estimated Input Cost
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ Model ┃ Cost ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ gpt-4o-mini │ $0.000043 │
│ claude-3-haiku │ $0.000071 │
│ gpt-4o │ $0.000710 │
│ claude-3-sonnet │ $0.000852 │
│ gpt-4-turbo │ $0.002840 │
│ claude-3-opus │ $0.004260 │
└─────────────────┴───────────┘
Tips
- Context planning: Know your token budget before building prompts
- Compare models: Different tokenizers = different counts
- Cost awareness: Track costs during development
- Batch estimation: Pipe multiple files to estimate dataset costs