Evaluation — Document Summarizer
Evaluation metrics for summarization
| Metric | What it measures | Implementation |
|---|---|---|
| Coverage | Key facts from source present in summary | Word overlap or LLM judge |
| Faithfulness | No claims in the summary that contradict, or cannot be found in, the source | RAGAS or LLM judge |
| Compression ratio | How much was condensed | len(summary) / len(source) |
| Coherence | Does the summary read as a complete, logical text? | LLM judge (0.0–1.0 scale) |
| Format adherence | Does the bullets format contain bullet lines? Does the executive format include a TL;DR? | Rule-based checks |
Ground truth test set
Create reference summaries for 10–15 documents. These are your ground truth — manually written or taken from article abstracts:
# test_documents.py
# Evaluation fixtures: one dict per source document.
#   "name"              - label printed in the evaluation output
#   "text"              - source text sent to the summarizer
#   "reference_summary" - manually written ground-truth summary
#   "key_facts"         - phrases looked up in the generated summary to score coverage
EVAL_SET = [
    {
        "name": "Wikipedia: Python (programming language)",
        "text": "Python is a high-level, general-purpose programming language...", # first 2000 chars
        "reference_summary": "Python is a versatile, interpreted programming language known for its readable syntax and large standard library. Created by Guido van Rossum in 1991, it supports multiple programming paradigms and is widely used in web development, data science, and AI.",
        "key_facts": ["Guido van Rossum", "1991", "interpreted", "multiple paradigms"],
    },
    # Add 9–14 more
]
LLM-as-judge evaluation
# eval.py
import os
import httpx
from openai import OpenAI
from test_documents import EVAL_SET
from dotenv import load_dotenv
# Load variables from a local .env file first so OPENAI_API_KEY is available below.
load_dotenv()
# Shared OpenAI client used by the judge_* functions to grade summaries.
judge = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def judge_faithfulness(summary: str, source: str, max_source_chars: int = 800) -> float:
    """Ask the LLM judge how faithful ``summary`` is to ``source``.

    Args:
        summary: Generated summary to grade.
        source: Original document text. Only the first ``max_source_chars``
            characters are included in the judge prompt.
        max_source_chars: Length of the source excerpt shown to the judge.
            Increase it when documents are longer than the default excerpt;
            otherwise correct claims drawn from later text can be graded as
            hallucinations.

    Returns:
        A score in [0.0, 1.0]. Falls back to 0.5 when the judge's reply is
        empty or is not a finite number.
    """
    import math  # local import keeps this block self-contained

    resp = judge.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": (
                f"Does this summary contain only information that can be found in the source text? "
                f"Rate 0.0 (contains false/hallucinated claims) to 1.0 (entirely faithful).\n"
                f"Respond with only a number.\n\n"
                f"Source (first {max_source_chars} chars): {source[:max_source_chars]}\n\n"
                f"Summary: {summary}"
            ),
        }],
        temperature=0.0, max_tokens=5,
    )
    # The message content is optional in the API response; treat a missing
    # reply like an unparseable one instead of crashing on None.strip().
    raw = (resp.choices[0].message.content or "").strip()
    try:
        score = float(raw)
    except ValueError:
        return 0.5
    # float() accepts "nan"/"inf"; those are not usable scores.
    if not math.isfinite(score):
        return 0.5
    # Clamp so an off-scale reply (e.g. "5") cannot distort the averages.
    return min(1.0, max(0.0, score))
def judge_coherence(summary: str) -> float:
    """Ask the LLM judge to rate how coherent ``summary`` is.

    Args:
        summary: Generated summary to grade.

    Returns:
        A score in [0.0, 1.0] (0.0 = incoherent, 1.0 = clear and logically
        structured). Falls back to 0.5 when the judge's reply is empty or is
        not a finite number.
    """
    import math  # local import keeps this block self-contained

    resp = judge.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": (
                f"Rate the coherence of this summary from 0.0 (incoherent, disjointed) to 1.0 (clear and logically structured).\n"
                f"Respond with only a number.\n\nSummary: {summary}"
            ),
        }],
        temperature=0.0, max_tokens=5,
    )
    # The message content is optional in the API response; treat a missing
    # reply like an unparseable one instead of crashing on None.strip().
    raw = (resp.choices[0].message.content or "").strip()
    try:
        score = float(raw)
    except ValueError:
        return 0.5
    # float() accepts "nan"/"inf"; those are not usable scores.
    if not math.isfinite(score):
        return 0.5
    # Clamp so an off-scale reply (e.g. "4") cannot distort the averages.
    return min(1.0, max(0.0, score))
def key_fact_coverage(summary: str, key_facts: list[str]) -> float:
    """Return the fraction of ``key_facts`` that appear in ``summary``.

    A fact counts as covered when it occurs verbatim in the summary
    (case-insensitive), or when every word of the fact occurs somewhere in
    the summary. The word-overlap fallback lets "multiple paradigms" match
    "multiple programming paradigms", which a pure substring test misses.

    Args:
        summary: Generated summary text.
        key_facts: Short phrases expected to be present in the summary.

    Returns:
        Covered facts divided by total facts; 0.0 when ``key_facts`` is empty.
    """
    import re  # local import keeps this block self-contained

    if not key_facts:
        return 0.0

    summary_lower = summary.lower()
    summary_words = set(re.findall(r"\w+", summary_lower))

    def _is_covered(fact: str) -> bool:
        fact_lower = fact.lower()
        # Verbatim match first, so every hit of the substring test is kept.
        if fact_lower in summary_lower:
            return True
        fact_words = re.findall(r"\w+", fact_lower)
        return bool(fact_words) and all(word in summary_words for word in fact_words)

    hits = sum(1 for fact in key_facts if _is_covered(fact))
    return hits / len(key_facts)
def run_evaluation(base_url: str = "http://localhost:8000") -> dict:
    """Summarize every document in ``EVAL_SET`` and report average metrics.

    For each document the text is POSTed to ``{base_url}/summarize/text``
    (paragraph format, concise style). The returned summary is scored for
    faithfulness, coherence, key-fact coverage and compression ratio.
    Documents whose request fails, returns a non-200 status, or returns a
    body without a ``"summary"`` field are reported as ``FAIL`` and left out
    of the averages.

    Args:
        base_url: Root URL of the summarizer service.

    Returns:
        Dict with the mean ``faithfulness``, ``coherence``, ``coverage`` and
        ``compression`` over the successfully evaluated documents (all zero
        when none succeeded).
    """
    metric_names = ["faithfulness", "coherence", "coverage", "compression"]
    results = []
    with httpx.Client(timeout=60.0) as client:
        for doc in EVAL_SET:
            try:
                resp = client.post(
                    f"{base_url}/summarize/text",
                    params={"format_type": "paragraph", "style": "concise"},
                    json=doc["text"],
                )
            except httpx.RequestError as exc:
                # A timeout or refused connection on one document should not
                # abort the whole evaluation run.
                print(f"FAIL: {doc['name']} → {exc!r}")
                continue
            if resp.status_code != 200:
                print(f"FAIL: {doc['name']} → {resp.status_code}")
                continue
            try:
                summary = resp.json()["summary"]
            except (ValueError, KeyError, TypeError):
                # Body is not JSON, or the JSON has no "summary" entry.
                print(f"FAIL: {doc['name']} → response has no 'summary' field")
                continue
            source_text = doc["text"]
            # Guard against an empty fixture text (would divide by zero).
            compression = len(summary) / len(source_text) if source_text else 0.0
            faithfulness = judge_faithfulness(summary, source_text)
            coherence = judge_coherence(summary)
            coverage = key_fact_coverage(summary, doc.get("key_facts", []))
            results.append({
                "name": doc["name"][:40],
                "faithfulness": faithfulness,
                "coherence": coherence,
                "coverage": coverage,
                "compression": compression,
            })
            print(f" F={faithfulness:.2f} C={coherence:.2f} Cov={coverage:.0%} | {doc['name'][:40]}")

    def avg(key: str) -> float:
        """Mean of one metric over the evaluated documents (0.0 if none)."""
        return sum(r[key] for r in results) / len(results) if results else 0.0

    print("\n=== Summarizer Evaluation ===")
    # Failed documents are excluded from the means, so show how many counted.
    print(f"Documents evaluated: {len(results)}/{len(EVAL_SET)}")
    print(f"Faithfulness: {avg('faithfulness'):.2f}")
    print(f"Coherence: {avg('coherence'):.2f}")
    print(f"Coverage: {avg('coverage'):.0%}")
    print(f"Compression: {avg('compression'):.0%} of original length")
    return {name: avg(name) for name in metric_names}
# When executed as a script, evaluate the summarizer at the default base URL.
if __name__ == "__main__":
    run_evaluation()
Format adherence checks
def check_format_adherence(summary: str, format_type: str) -> bool:
    """Check that ``summary`` has the structure expected for ``format_type``.

    * ``"bullets"``: at least three lines start (after leading whitespace)
      with ``-``, ``•`` or ``*``.
    * ``"executive"``: the text contains one of "tldr", "tl;dr", "in brief"
      or "summary:" (case-insensitive).
    * any other format: always passes, since no structure is required.
    """
    if format_type == "bullets":
        bullet_count = 0
        for raw_line in summary.strip().splitlines():
            if raw_line.strip().startswith(("-", "•", "*")):
                bullet_count += 1
        return bullet_count >= 3

    if format_type == "executive":
        lowered = summary.lower()
        for marker in ("tldr", "tl;dr", "in brief", "summary:"):
            if marker in lowered:
                return True
        return False

    # Paragraph (and any unrecognised format) has no structural requirement.
    return True
Target metrics
- Faithfulness ≥ 0.85 mean judge score (roughly: fewer than 15% of summaries contain hallucinations)
- Coherence ≥ 0.80
- Key fact coverage ≥ 70%
- Compression ratio: 5–15% of original length for paragraph summaries