Python — Embeddings — February 8, 2026

Embedding Similarity Checker

Compare texts semantically using embeddings and cosine similarity. Find similar documents, detect duplicates, and build search systems.

embeddings · similarity · cosine · semantic-search · vectors

Description

Understand how embeddings capture semantic meaning by comparing texts. Visualize similarity scores and find the most related content in a corpus.

Requirements

pip install openai numpy scikit-learn rich

similarity.py

#!/usr/bin/env python3
"""
Embedding Similarity - Compare texts semantically
Usage: python similarity.py "text1" "text2"
       python similarity.py -f corpus.txt -q "query"
"""

import os
import sys
import argparse
from typing import List, Optional, Tuple

import numpy as np
from openai import OpenAI
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import track

console = Console()
client = OpenAI()

# ════════════════════════════════════════════════════════════════════════════
# Embeddings
# ════════════════════════════════════════════════════════════════════════════

EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536

def get_embedding(text: str) -> np.ndarray:
    """Embed *text* via the OpenAI API and return its vector as a 1-D array."""
    result = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=text,
    )
    return np.array(result.data[0].embedding)

def get_embeddings_batch(texts: List[str]) -> np.ndarray:
    """Embed every string in *texts* with one API call.

    Returns a 2-D array with one embedding row per input text.
    """
    result = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=texts,
    )
    vectors = [item.embedding for item in result.data]
    return np.array(vectors)

# ════════════════════════════════════════════════════════════════════════════
# Similarity Metrics
# ════════════════════════════════════════════════════════════════════════════

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    Ranges from -1.0 (opposite direction) to 1.0 (same direction).
    Returns 0.0 when either vector has zero magnitude; the original
    formula divided by zero there and produced NaN plus a runtime warning.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # A zero vector carries no direction — treat it as fully dissimilar.
        return 0.0
    return float(np.dot(a, b) / denom)

def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return the L2 (Euclidean) distance between vectors *a* and *b*."""
    delta = a - b
    return np.linalg.norm(delta)

def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """Return the raw (unnormalized) dot product of *a* and *b*."""
    return a @ b

# ════════════════════════════════════════════════════════════════════════════
# Comparison Functions
# ════════════════════════════════════════════════════════════════════════════

def compare_two(text1: str, text2: str):
    """Embed two texts and display their similarity in several metrics.

    Prints both texts, a metrics table (cosine, Euclidean, dot product),
    a human-readable interpretation of the cosine score, and a colored
    similarity bar.
    """
    console.print(Panel(text1, title="[cyan]Text 1[/]", border_style="cyan"))
    console.print(Panel(text2, title="[cyan]Text 2[/]", border_style="cyan"))
    console.print()
    
    with console.status("Computing embeddings..."):
        emb1 = get_embedding(text1)
        emb2 = get_embedding(text2)
    
    cos_sim = cosine_similarity(emb1, emb2)
    euc_dist = euclidean_distance(emb1, emb2)
    dot = dot_product(emb1, emb2)
    
    # Heuristic buckets mapping cosine score to a plain-English verdict.
    if cos_sim > 0.9:
        interpretation = "[green]Very similar - Nearly identical meaning[/]"
    elif cos_sim > 0.8:
        interpretation = "[green]Similar - Related concepts[/]"
    elif cos_sim > 0.6:
        interpretation = "[yellow]Somewhat related[/]"
    elif cos_sim > 0.4:
        interpretation = "[yellow]Weakly related[/]"
    else:
        interpretation = "[red]Not similar - Different topics[/]"
    
    table = Table(title="Similarity Metrics")
    table.add_column("Metric", style="cyan")
    table.add_column("Value", justify="right")
    table.add_column("Interpretation", style="dim")
    
    table.add_row("Cosine Similarity", f"{cos_sim:.4f}", f"{cos_sim*100:.1f}% similar")
    table.add_row("Euclidean Distance", f"{euc_dist:.4f}", "Lower = more similar")
    table.add_row("Dot Product", f"{dot:.4f}", "Higher = more similar")
    
    console.print(table)
    console.print()
    console.print(f"[bold]Interpretation:[/] {interpretation}")
    
    # Visual similarity bar.
    # BUG FIX: the fill/empty glyphs were empty strings ("" * filled),
    # which always rendered a blank bar; restore the block glyphs shown
    # in the documented example output.
    bar_width = 40
    filled = int(cos_sim * bar_width)
    bar = "█" * filled + "░" * (bar_width - filled)
    console.print(f"\n[dim]Similarity:[/] [{get_color(cos_sim)}]{bar}[/] {cos_sim*100:.1f}%")

def get_color(sim: float) -> str:
    """Map a similarity score to a Rich color name for display."""
    if sim > 0.8:
        return "green"
    elif sim > 0.6:
        return "yellow"
    else:
        return "red"

def search_corpus(query: str, corpus: List[str], top_k: int = 5):
    """Rank *corpus* texts by cosine similarity to *query*; print top *top_k*."""
    console.print(Panel(query, title="[cyan]Query[/]", border_style="cyan"))
    console.print(f"\n[dim]Searching {len(corpus)} documents...[/]\n")
    
    with console.status("Computing embeddings..."):
        query_emb = get_embedding(query)
        corpus_embs = get_embeddings_batch(corpus)
    
    # Score every document against the query, then rank high-to-low.
    scored = [(idx, cosine_similarity(query_emb, doc_emb))
              for idx, doc_emb in enumerate(corpus_embs)]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    
    table = Table(title=f"Top {top_k} Results")
    table.add_column("#", style="dim", width=3)
    table.add_column("Text", style="white", max_width=60)
    table.add_column("Similarity", justify="right", style="green")
    
    for rank, (idx, sim) in enumerate(scored[:top_k], 1):
        doc = corpus[idx]
        # Truncate long documents so the table stays readable.
        text_preview = doc[:80] + "..." if len(doc) > 80 else doc
        table.add_row(str(rank), text_preview, f"[{get_color(sim)}]{sim*100:.1f}%[/]")
    
    console.print(table)

def similarity_matrix(texts: List[str], labels: Optional[List[str]] = None):
    """Render a pairwise cosine-similarity table for *texts*.

    Args:
        texts: Texts to compare against one another.
        labels: Optional display names (truncated to 10 characters in the
            table); defaults to "Text 1", "Text 2", ...

    Fixes the annotation (`labels` is optional, not plain List[str]) and
    computes only the upper triangle: cosine similarity is symmetric, so
    the original n*n loop did twice the necessary work.
    """
    n = len(texts)
    labels = labels or [f"Text {i+1}" for i in range(n)]
    
    with console.status("Computing embeddings..."):
        embeddings = get_embeddings_batch(texts)
    
    # Build the similarity matrix; mirror each pair across the diagonal.
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            sim = cosine_similarity(embeddings[i], embeddings[j])
            matrix[i][j] = sim
            matrix[j][i] = sim
    
    # Display as a color-coded table.
    table = Table(title="Similarity Matrix")
    table.add_column("", style="cyan")
    for label in labels:
        table.add_column(label[:10], justify="center")
    
    for i, label in enumerate(labels):
        row = [label[:10]]
        for j in range(n):
            sim = matrix[i][j]
            color = get_color(sim)
            row.append(f"[{color}]{sim:.2f}[/]")
        table.add_row(*row)
    
    console.print(table)

# ════════════════════════════════════════════════════════════════════════════
# Examples
# ════════════════════════════════════════════════════════════════════════════

def run_examples():
    """Run built-in text pairs demonstrating embedding similarity.

    Each pair is embedded, scored with cosine similarity, and shown with
    a colored 20-character bar.
    """
    console.print("[bold cyan]Embedding Similarity Examples[/]\n")
    
    examples = [
        ("The cat sat on the mat", "A feline rested on the rug"),
        ("The cat sat on the mat", "Stock prices rose yesterday"),
        ("I love programming", "Coding is my passion"),
        ("The weather is nice today", "It's a beautiful sunny day"),
        ("Python is a programming language", "Python is a type of snake"),
    ]
    
    for text1, text2 in examples:
        console.print(f"[dim]'{text1}'[/]")
        console.print(f"[dim]'{text2}'[/]")
        
        emb1 = get_embedding(text1)
        emb2 = get_embedding(text2)
        sim = cosine_similarity(emb1, emb2)
        
        color = get_color(sim)
        bar_width = 20
        filled = int(sim * bar_width)
        # BUG FIX: fill/empty glyphs were empty strings, so no bar was
        # ever drawn; use the same block glyphs as the two-text view.
        bar = "█" * filled + "░" * (bar_width - filled)
        console.print(f"[{color}]{bar} {sim*100:.1f}%[/]\n")

# ════════════════════════════════════════════════════════════════════════════
# Main
# ════════════════════════════════════════════════════════════════════════════

def main():
    """CLI entry point: dispatch to examples, corpus search, matrix, or pairwise compare."""
    parser = argparse.ArgumentParser(description="Compare texts using embeddings")
    parser.add_argument("texts", nargs="*", help="Texts to compare")
    parser.add_argument("-f", "--file", help="Corpus file (one text per line)")
    parser.add_argument("-q", "--query", help="Query to search in corpus")
    parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results")
    parser.add_argument("--examples", action="store_true", help="Run examples")
    parser.add_argument("--matrix", action="store_true", help="Show similarity matrix")
    args = parser.parse_args()
    
    if args.examples:
        run_examples()
    elif args.file and args.query:
        # Explicit UTF-8: corpus files may contain non-ASCII text, and the
        # platform default encoding (e.g. cp1252 on Windows) would raise
        # UnicodeDecodeError on it. Blank lines are skipped.
        with open(args.file, 'r', encoding='utf-8') as f:
            corpus = [line.strip() for line in f if line.strip()]
        search_corpus(args.query, corpus, args.top_k)
    elif args.matrix and args.texts:
        similarity_matrix(args.texts)
    elif len(args.texts) == 2:
        compare_two(args.texts[0], args.texts[1])
    elif len(args.texts) > 2:
        # Three or more positional texts default to the matrix view.
        similarity_matrix(args.texts)
    else:
        console.print("[yellow]Usage:[/]")
        console.print("  python similarity.py 'text1' 'text2'")
        console.print("  python similarity.py -f corpus.txt -q 'query'")
        console.print("  python similarity.py --examples")
        console.print("  python similarity.py --matrix 'a' 'b' 'c'")

if __name__ == "__main__":
    main()

Usage

# Compare two texts
python similarity.py "The cat sat on the mat" "A feline rested on the rug"

# Search a corpus
python similarity.py -f documents.txt -q "machine learning" -k 10

# Show similarity matrix
python similarity.py --matrix "AI" "Machine Learning" "Deep Learning" "Cooking"

# Run built-in examples
python similarity.py --examples

Example Output

╭──────────── Text 1 ────────────╮
│ The cat sat on the mat         │
╰────────────────────────────────╯
╭──────────── Text 2 ────────────╮
│ A feline rested on the rug     │
╰────────────────────────────────╯

      Similarity Metrics
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
┃ Metric             ┃   Value ┃ Interpretation       ┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
│ Cosine Similarity  │  0.8934 │ 89.3% similar        │
│ Euclidean Distance │  0.4621 │ Lower = more similar │
│ Dot Product        │  0.8934 │ Higher = more similar│
└────────────────────┴─────────┴──────────────────────┘

Interpretation: Very similar - Nearly identical meaning

Similarity: ████████████████████████████████████░░░░ 89.3%

Use Cases

  • Duplicate detection: Find near-duplicate content
  • Semantic search: Build search that understands meaning
  • Clustering: Group similar documents together
  • Recommendations: Find related content
  • Plagiarism detection: Identify similar submissions