Evaluation and Testing
A demo that works once on a hand-picked input is not evidence that your system works. Evaluation means running your system on a representative set of inputs and measuring outcomes you defined before you saw the results.
Learning objectives
- Define evaluation metrics appropriate to your project type
- Build a test suite that runs against your live API
- Measure latency, cache hit rate, and quality metrics quantitatively
- Interpret results and identify the most impactful improvement to make
Evaluation by project type
Option A — RAG service
The three metrics that matter for RAG:
| Metric | What it measures | How to measure |
|---|---|---|
| Faithfulness | Does the answer use only facts from the retrieved context? | RAGAS faithfulness or LLM-as-judge |
| Answer relevance | Does the answer address the actual question? | RAGAS answer_relevancy |
| Retrieval precision | Are the retrieved chunks relevant? | Human label or MRR on a known Q&A set |
# eval_rag.py
import json
import time
import httpx
from statistics import mean
# Evaluation fixtures: each entry pairs a user question with the keywords a
# good answer is expected to contain.
TEST_SET = [
    {
        "question": "How do I reset my password?",
        "expected_keywords": ["reset", "email", "link"],
    },
    {
        "question": "What payment methods do you accept?",
        "expected_keywords": ["credit card", "PayPal"],
    },
    # Add 15–20 more for a meaningful evaluation
]
def keyword_precision(answer: str, keywords: list[str]) -> float:
    """Return the share of ``keywords`` that occur in ``answer``.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it appears inside a longer word. An empty keyword list scores 0.0.
    """
    haystack = answer.lower()
    if not keywords:
        return 0.0
    found = [kw for kw in keywords if kw.lower() in haystack]
    return len(found) / len(keywords)
def run_rag_eval(base_url: str = "http://localhost:8000") -> dict:
    """Run the RAG test set against the live ``/chat`` endpoint and report metrics.

    Pass 1 sends every ``TEST_SET`` question with ``use_cache=False`` and
    records keyword precision and wall-clock latency for each 200 response.
    Pass 2 re-sends up to the first five questions with ``use_cache=True`` and
    counts the responses whose JSON body has a truthy ``"cached"`` field.

    Requests that fail (transport error, timeout, or non-200 status) are
    printed as ``FAIL`` and excluded from the metrics.

    Args:
        base_url: Root URL of the service under test.

    Returns:
        A dict with ``avg_precision`` (0.0-1.0), ``p95_latency_ms`` and
        ``cache_hits``. All three are zero when no request succeeded.
    """
    results = []
    latencies = []
    cache_hits = 0
    # The re-run count is derived from the slice so the report stays correct
    # when TEST_SET holds fewer than five questions.
    warm_items = TEST_SET[:5]

    with httpx.Client(timeout=30.0) as client:
        # First pass: cold cache
        for item in TEST_SET:
            start = time.perf_counter()
            try:
                resp = client.post(
                    f"{base_url}/chat",
                    json={"question": item["question"], "use_cache": False},
                )
            except httpx.HTTPError as exc:
                # A timeout on one question should not abort the whole run.
                print(f"FAIL: {item['question'][:40]} → {exc!r}")
                continue
            latency_ms = (time.perf_counter() - start) * 1000
            if resp.status_code != 200:
                print(f"FAIL: {item['question'][:40]} → {resp.status_code}")
                continue
            data = resp.json()
            precision = keyword_precision(data["answer"], item["expected_keywords"])
            results.append(
                {"question": item["question"], "precision": precision, "latency_ms": latency_ms}
            )
            latencies.append(latency_ms)

        # Second pass: warm cache
        # NOTE(review): pass 1 ran with use_cache=False; if the service also
        # skips cache *writes* in that mode these requests will all miss.
        # Confirm the server semantics before trusting a low hit count.
        for item in warm_items:
            try:
                resp = client.post(
                    f"{base_url}/chat",
                    json={"question": item["question"], "use_cache": True},
                )
            except httpx.HTTPError as exc:
                print(f"FAIL: {item['question'][:40]} → {exc!r}")
                continue
            if resp.status_code != 200:
                # An error body may not be JSON or may lack "cached".
                print(f"FAIL: {item['question'][:40]} → {resp.status_code}")
                continue
            if resp.json().get("cached"):
                cache_hits += 1

    avg_precision = mean(r["precision"] for r in results) if results else 0.0

    if latencies:
        ordered = sorted(latencies)
        # Nearest-rank percentile: rank = ceil(0.95 * n), computed in integer
        # arithmetic. Indexing with int(n * 0.95) would select the maximum
        # for every n <= 20.
        rank = (95 * len(ordered) + 99) // 100
        p95_latency = ordered[rank - 1]
    else:
        p95_latency = 0.0

    print("\n=== RAG Evaluation Results ===")
    print(f"Questions evaluated: {len(results)}")
    print(f"Avg keyword precision: {avg_precision:.0%}")
    print(f"P95 latency: {p95_latency:.0f}ms")
    print(f"Cache hits ({len(warm_items)} re-runs): {cache_hits}/{len(warm_items)}")
    # Worst-scoring questions first, so the weakest answers are easy to spot.
    for r in sorted(results, key=lambda x: x["precision"]):
        print(f" {r['precision']:.0%} | {r['latency_ms']:.0f}ms | {r['question'][:60]}")

    return {"avg_precision": avg_precision, "p95_latency_ms": p95_latency, "cache_hits": cache_hits}


if __name__ == "__main__":
    run_rag_eval()
Option B — LangGraph agent
Agents are harder to evaluate because output quality is subjective. Use a combination of structural checks (did it terminate? how many loops?) and LLM-as-judge for content quality.
# eval_agent.py
import asyncio
import time
from research_agent import run_research
# Research prompts given to the agent; each one is a separate evaluation run.
TEST_QUESTIONS = [
    "What are the tradeoffs between RAG and fine-tuning for LLM applications?",
    "How does the transformer attention mechanism work?",
    "What is the difference between LoRA and QLoRA?",
]

# Human-readable rubric for a good report. Nothing in the code shown in this
# script reads this list.
QUALITY_CRITERIA = [
    "Has an executive summary",
    "Cites specific technical details",
    "Has a clear conclusion",
    "Is longer than 300 words",
]
def structural_check(report: str) -> dict:
    """Run cheap, keyword-based structure checks on a generated report.

    Returns a dict of four booleans:

    * ``has_exec_summary`` - the text mentions "executive summary",
      "overview" or "introduction" (case-insensitive).
    * ``has_conclusion`` - the text mentions "conclusion", "in summary" or
      "in conclusion" (case-insensitive).
    * ``word_count_ok`` - at least 300 whitespace-separated words.
    * ``has_sections`` - at least two ``##`` or three ``**`` occurrences.
    """
    text = report.lower()

    def mentions(*phrases: str) -> bool:
        # True when any phrase occurs in the lower-cased report.
        return any(phrase in text for phrase in phrases)

    return {
        "has_exec_summary": mentions("executive summary", "overview", "introduction"),
        "has_conclusion": mentions("conclusion", "in summary", "in conclusion"),
        "word_count_ok": len(report.split()) >= 300,
        "has_sections": report.count("##") >= 2 or report.count("**") >= 3,
    }
async def run_agent_eval() -> None:
    """Run the research agent on every test question and print a summary.

    For each entry in ``TEST_QUESTIONS`` this awaits ``run_research``, times
    the call, and scores the returned report with ``structural_check``. The
    agent result is read as a mapping with ``"report"``, ``"quality_score"``
    and ``"attempts"`` keys.

    Prints the average quality score and average attempt count, followed by
    one line per question. Returns nothing.
    """
    if not TEST_QUESTIONS:
        # The averages below divide by the number of questions.
        print("No test questions defined; nothing to evaluate.")
        return

    total_attempts = 0
    total_score = 0.0
    results = []

    for question in TEST_QUESTIONS:
        start = time.perf_counter()
        result = await run_research(question)
        elapsed = time.perf_counter() - start

        checks = structural_check(result["report"])
        # Booleans sum as 0/1, so this is the fraction of checks passed.
        structural_score = sum(checks.values()) / len(checks)

        results.append({
            "question": question[:60],
            "quality_score": result["quality_score"],
            "structural_score": structural_score,
            "attempts": result["attempts"],
            "elapsed_s": round(elapsed, 1),
        })
        total_score += result["quality_score"]
        total_attempts += result["attempts"]

    question_count = len(TEST_QUESTIONS)
    print("\n=== Agent Evaluation Results ===")
    print(f"Questions: {question_count}")
    print(f"Avg quality score: {total_score/question_count:.2f}")
    print(f"Avg attempts (loops): {total_attempts/question_count:.1f}")
    for r in results:
        print(f" Q={r['quality_score']:.2f} S={r['structural_score']:.2f} att={r['attempts']} t={r['elapsed_s']}s | {r['question']}")
# Guarded so that importing this module does not start a full evaluation run.
if __name__ == "__main__":
    asyncio.run(run_agent_eval())
Option C — Document extraction
For extraction tasks, ground-truth comparison is the most rigorous evaluation.
# eval_extraction.py
import json
import httpx
# Manually verified extractions: each case names a test PDF and the field
# values the extraction endpoint is expected to return for it.
GROUND_TRUTH = [
    {
        "pdf": "test_invoice_001.pdf",
        "expected": {
            "vendor_name": "Acme Corp",
            "total_amount": 1250.00,
            "currency": "USD",
            "invoice_number": "INV-2024-001",
        },
    },
    # Add more test cases
]
def field_accuracy(predicted: dict, expected: dict, *, tolerance: float = 0.01) -> float:
    """Return the fraction of ``expected`` fields that ``predicted`` got right.

    Numeric expectations (``int`` or ``float``, but not ``bool``) match when
    the predicted value converts to a number that differs from the expected
    one by less than ``tolerance``. A missing or non-numeric prediction counts
    as wrong. Every other expectation is compared as a case-insensitive
    string.

    Args:
        predicted: Field values returned by the extractor.
        expected: Ground-truth field values; only these keys are scored.
        tolerance: Maximum absolute difference (exclusive) for a numeric match.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when ``expected`` is empty.
    """
    if not expected:
        return 0.0

    correct = 0
    for field, expected_val in expected.items():
        pred_val = predicted.get(field)
        is_numeric = isinstance(expected_val, (int, float)) and not isinstance(expected_val, bool)
        if is_numeric:
            try:
                # Extractors often return amounts as strings ("1250.00").
                pred_num = float(pred_val)
            except (TypeError, ValueError):
                # None (missing field) or unparseable text is a miss. It must
                # not be treated as 0, which would match an expected 0.0.
                continue
            correct += abs(pred_num - expected_val) < tolerance
        else:
            correct += str(pred_val).lower() == str(expected_val).lower()
    return correct / len(expected)
def run_extraction_eval(base_url: str = "http://localhost:8000") -> dict:
    """Post each ground-truth PDF to ``/extract/invoice`` and score the result.

    Each file in ``GROUND_TRUTH`` is uploaded as multipart form data; the JSON
    response is compared with the expected fields via ``field_accuracy``.
    Cases that cannot be read, cannot be sent, or return a non-200 status are
    printed as ``FAIL`` and left out of the average.

    Args:
        base_url: Root URL of the service under test.

    Returns:
        ``{"avg_accuracy": float, "n": int}`` where ``n`` is the number of
        successfully scored files and ``avg_accuracy`` is 0.0 when ``n`` is 0.
    """
    results = []
    with httpx.Client(timeout=30.0) as client:
        for case in GROUND_TRUTH:
            pdf_path = case["pdf"]
            try:
                with open(pdf_path, "rb") as f:
                    resp = client.post(
                        f"{base_url}/extract/invoice",
                        files={"file": (pdf_path, f, "application/pdf")},
                    )
            except (OSError, httpx.HTTPError) as exc:
                # A missing fixture or a timeout should fail this case only,
                # not abort the remaining files.
                print(f"FAIL: {pdf_path} → {exc!r}")
                continue
            if resp.status_code != 200:
                print(f"FAIL: {pdf_path} → {resp.status_code}: {resp.text}")
                continue
            predicted = resp.json()
            accuracy = field_accuracy(predicted, case["expected"])
            # "confidence" may be absent or null; either way format it as 0.
            confidence = predicted.get("confidence") or 0
            results.append({"file": pdf_path, "accuracy": accuracy, "confidence": confidence})
            print(f" {accuracy:.0%} accuracy | conf={confidence:.2f} | {pdf_path}")

    avg_accuracy = sum(r["accuracy"] for r in results) / len(results) if results else 0.0
    print(f"\nAverage field accuracy: {avg_accuracy:.0%}")
    # Failed cases are excluded from the average, so show how many counted.
    print(f"Files evaluated: {len(results)}/{len(GROUND_TRUTH)}")
    return {"avg_accuracy": avg_accuracy, "n": len(results)}


if __name__ == "__main__":
    run_extraction_eval()
API endpoint testing (all options)
Before running the eval suite, verify the endpoints themselves:
# test_endpoints.py
import httpx
def test_health(base_url: str = "http://localhost:8000") -> bool:
    """Check that ``GET /health`` returns 200 with ``{"status": "ok"}``.

    Args:
        base_url: Root URL of the service under test.

    Returns:
        ``True`` when both checks pass.

    Raises:
        AssertionError: If the status code is not 200 or the payload's
            ``status`` field is missing or not ``"ok"``.
    """
    resp = httpx.get(f"{base_url}/health")
    assert resp.status_code == 200, f"Health check failed: {resp.status_code}"
    data = resp.json()
    # .get() turns a missing key into a readable assertion failure rather
    # than a bare KeyError.
    assert data.get("status") == "ok", f"Unexpected health payload: {data}"
    print(f"Health: OK — {data}")
    return True
def test_invalid_input(base_url: str = "http://localhost:8000") -> bool:
    """Check that ``POST /chat`` rejects empty and oversized questions with 422.

    Args:
        base_url: Root URL of the service under test.

    Returns:
        ``True`` when both requests are rejected with HTTP 422.

    Raises:
        AssertionError: If either request returns a status other than 422.
    """
    # Empty question should return 422
    resp = httpx.post(f"{base_url}/chat", json={"question": ""})
    assert resp.status_code == 422, f"Expected 422, got {resp.status_code}"
    print("Invalid input: correctly rejected")

    # Question too long
    # NOTE(review): assumes 5000 characters exceeds the service's length
    # limit; confirm against the request schema.
    resp = httpx.post(f"{base_url}/chat", json={"question": "x" * 5000})
    assert resp.status_code == 422, f"Expected 422 for oversized input, got {resp.status_code}"
    print("Oversized input: correctly rejected")
    return True


if __name__ == "__main__":
    test_health()
    test_invalid_input()
    print("All endpoint tests passed")
Evaluation targets for capstone
- RAG: avg keyword precision ≥ 70%, P95 latency < 3s, cache hit rate ≥ 50% on repeated queries
- Agent: avg quality score ≥ 0.75, avg loops ≤ 2.0
- Extraction: field accuracy ≥ 80% on ground-truth test set
Don't evaluate on your training/development inputs
If you only test on examples you already checked while building, you're measuring memorization, not generalization. Your eval set should contain inputs you haven't looked at during development.