Evaluation — Fine-Tuned Classifier
The comparison you're trying to answer
Does fine-tuning a small model outperform prompting a large model on this task?
Run both models on the same held-out test set and compare their macro F1 scores. If the fine-tuned model wins, fine-tuning was worth it. If zero-shot wins, the task wasn't narrow enough or your training data was insufficient.
Evaluation script
# eval.py
import os
import json
import asyncio
import httpx
from openai import AsyncOpenAI
from sklearn.metrics import classification_report, f1_score
from dotenv import load_dotenv
# Called before the client is built so OPENAI_API_KEY can be supplied by a
# local .env file (assumes default python-dotenv behaviour — TODO confirm).
load_dotenv()
# Module-level async OpenAI client used by predict_zero_shot; the API key is
# read from the OPENAI_API_KEY environment variable.
aclient = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# System message sent with every zero-shot request. It asks for a one-word
# answer drawn from the three labels that predict_zero_shot looks for.
SYSTEM_PROMPT = (
"Classify the sentiment of the following product review. "
"Respond with exactly one word: positive, negative, or neutral."
)
def load_test_set(path: str = "data/test.jsonl") -> list[dict]:
    """Load the held-out test set from a JSON Lines file.

    Args:
        path: Location of the ``.jsonl`` file. Every non-blank line must hold
            one JSON value (the rest of the script expects an object with a
            ``"messages"`` list).

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: review text may contain non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        # Blank lines (typically a trailing one at EOF) are skipped instead of
        # being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]
def extract_label(messages: list[dict]) -> str:
    """Return the ground-truth label stored in the first assistant message.

    The label is the assistant message's ``content`` with surrounding
    whitespace removed and lower-cased. If no message has the role
    ``"assistant"``, the string ``"unknown"`` is returned.
    """
    # Lazy generator: scanning stops at the first assistant turn.
    assistant_turns = (m for m in messages if m["role"] == "assistant")
    first = next(assistant_turns, None)
    if first is None:
        return "unknown"
    return first["content"].strip().lower()
def extract_text(messages: list[dict]) -> str:
    """Return the content of the first user message, unmodified.

    Falls back to the empty string when no message has the role ``"user"``.
    """
    # Generator keeps the scan lazy, so it stops at the first user turn.
    return next((m["content"] for m in messages if m["role"] == "user"), "")
# ---- Zero-shot baseline ----
async def predict_zero_shot(text: str) -> str:
    """Classify one review with the zero-shot gpt-4o-mini baseline.

    Sends ``SYSTEM_PROMPT`` plus the review text through the module-level
    ``aclient`` and maps the reply onto one of the three sentiment labels.

    Args:
        text: The review to classify.

    Returns:
        ``"positive"``, ``"negative"`` or ``"neutral"`` — the first of these
        (checked in that order) that occurs as a substring of the lower-cased
        reply — or ``"unknown"`` when none occurs or the reply has no text.

    Raises:
        Whatever the OpenAI client raises for a failed request; errors are not
        caught here.
    """
    resp = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
        temperature=0.0,
        max_tokens=5,
    )
    # The message content can be None (for instance when the model returns no
    # text); treat that as an empty reply instead of crashing on .strip().
    raw = (resp.choices[0].message.content or "").strip().lower()
    # Substring match tolerates trailing punctuation such as "positive.".
    return next(
        (label for label in ("positive", "negative", "neutral") if label in raw),
        "unknown",
    )
async def run_zero_shot_eval(
    test_set: list[dict], max_concurrency: int = 10
) -> tuple[list[str], list[str]]:
    """Run the zero-shot baseline over the whole test set.

    Args:
        test_set: Records that each carry a ``"messages"`` list.
        max_concurrency: Upper bound on simultaneous in-flight API requests.
            Firing one request per example all at once can trip provider
            rate limits on larger test sets.

    Returns:
        ``(labels, predictions)`` — two lists aligned with ``test_set`` order.

    Raises:
        ValueError: If ``max_concurrency`` is less than 1.
    """
    if max_concurrency < 1:
        # A zero-valued semaphore would make every request wait forever.
        raise ValueError("max_concurrency must be at least 1")

    texts = [extract_text(ex["messages"]) for ex in test_set]
    labels = [extract_label(ex["messages"]) for ex in test_set]

    limiter = asyncio.Semaphore(max_concurrency)

    async def _bounded_predict(text: str) -> str:
        # Hold a semaphore slot for the duration of one API call.
        async with limiter:
            return await predict_zero_shot(text)

    # gather returns results in argument order, so predictions stay aligned
    # with labels even though requests finish out of order.
    predictions = await asyncio.gather(*(_bounded_predict(t) for t in texts))
    return labels, list(predictions)
# ---- Fine-tuned model ----
def _label_from_response(resp) -> str:
    """Turn one ``/classify`` HTTP response into a normalized label or "unknown"."""
    if resp.status_code != 200:
        return "unknown"
    try:
        label = resp.json()["label"]
    except (ValueError, KeyError, TypeError):
        # Body is not valid JSON, lacks a "label" key, or is not a JSON object.
        return "unknown"
    if not isinstance(label, str):
        return "unknown"
    # Ground-truth labels and zero-shot predictions are stripped and
    # lower-cased; apply the same normalization so a reply such as "Positive"
    # is not scored as a miss.
    return label.strip().lower()


def run_finetuned_eval(test_set: list[dict], base_url: str = "http://localhost:8000") -> tuple[list[str], list[str]]:
    """Evaluate the fine-tuned model served over HTTP.

    Posts each review to ``{base_url}/classify`` as ``{"text": ...}``, one
    request at a time, and reads the ``"label"`` field of the JSON reply.

    Args:
        test_set: Records that each carry a ``"messages"`` list.
        base_url: Root URL of the classification server.

    Returns:
        ``(labels, predictions)`` aligned with ``test_set`` order. A
        prediction is ``"unknown"`` when the server answers with a non-200
        status or with a body that has no usable string label.

    Raises:
        Transport-level errors from ``httpx`` (for example when the server is
        not running) are not caught and propagate to the caller.
    """
    labels = [extract_label(ex["messages"]) for ex in test_set]
    predictions = []
    # One client for the whole run so connections can be reused.
    with httpx.Client(timeout=30.0) as client:
        for ex in test_set:
            text = extract_text(ex["messages"])
            resp = client.post(f"{base_url}/classify", json={"text": text})
            predictions.append(_label_from_response(resp))
    return labels, predictions
# ---- Compare ----
def compare(zero_shot_labels, zero_shot_preds, finetuned_labels, finetuned_preds) -> None:
    """Print per-model classification reports and a macro-F1 comparison.

    Args:
        zero_shot_labels: Ground-truth labels for the zero-shot run.
        zero_shot_preds: Zero-shot predictions, aligned with the labels.
        finetuned_labels: Ground-truth labels for the fine-tuned run.
        finetuned_preds: Fine-tuned predictions, aligned with the labels.

    Metrics are computed over the classes that occur in the ground truth.
    Without that restriction, scikit-learn averages over every label seen in
    either the truth or the predictions, so one stray prediction such as
    "unknown" adds a zero-F1 class to the macro average and skews the
    comparison between the two models. Stray predictions still count as
    errors against their true class.
    """
    zs_classes = sorted(set(zero_shot_labels))
    ft_classes = sorted(set(finetuned_labels))

    zs_f1 = f1_score(zero_shot_labels, zero_shot_preds, labels=zs_classes, average="macro", zero_division=0)
    ft_f1 = f1_score(finetuned_labels, finetuned_preds, labels=ft_classes, average="macro", zero_division=0)

    # Predictions outside the ground-truth classes no longer get a report row,
    # so surface how many there were.
    zs_stray = sum(p not in zs_classes for p in zero_shot_preds)
    ft_stray = sum(p not in ft_classes for p in finetuned_preds)

    print("\n=== Zero-Shot (gpt-4o-mini) ===")
    print(classification_report(zero_shot_labels, zero_shot_preds, labels=zs_classes, zero_division=0))
    print(f"Macro F1: {zs_f1:.3f}")
    if zs_stray:
        print(f"Predictions outside the ground-truth classes: {zs_stray}")

    print("\n=== Fine-Tuned (Qwen2-0.5B-Instruct + QLoRA) ===")
    print(classification_report(finetuned_labels, finetuned_preds, labels=ft_classes, zero_division=0))
    print(f"Macro F1: {ft_f1:.3f}")
    if ft_stray:
        print(f"Predictions outside the ground-truth classes: {ft_stray}")

    print("\n=== Summary ===")
    print(f"Zero-shot F1: {zs_f1:.3f}")
    print(f"Fine-tuned F1: {ft_f1:.3f}")
    if ft_f1 == zs_f1:
        # Previously a tie was reported as a zero-shot win "by 0.000".
        print("Winner: tie")
    else:
        winner = "Fine-tuned" if ft_f1 > zs_f1 else "Zero-shot"
        delta = abs(ft_f1 - zs_f1)
        print(f"Winner: {winner} by {delta:.3f}")
async def main():
    """Load the test set, evaluate both models on it, and print the comparison.

    The fine-tuned evaluation talks to a separately started server.
    """
    test_set = load_test_set()
    print(f"Test set: {len(test_set)} examples")

    print("\nRunning zero-shot evaluation...")
    zero_shot_result = await run_zero_shot_eval(test_set)

    print("\nRunning fine-tuned evaluation (start the app.py server first)...")
    finetuned_result = run_finetuned_eval(test_set)

    # Each result is a (labels, predictions) pair; compare() takes them as
    # zero-shot labels, zero-shot preds, fine-tuned labels, fine-tuned preds.
    compare(*zero_shot_result, *finetuned_result)


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
Expected results
On a well-prepared 150-example dataset (50 per class):
| Model | Macro F1 | Notes |
|---|---|---|
| Zero-shot gpt-4o-mini | 0.88–0.92 | Strong baseline for sentiment |
| 5-shot gpt-4o-mini | 0.90–0.94 | Marginal improvement |
| Fine-tuned Qwen2-0.5B | 0.85–0.95 | Depends heavily on data quality |
Fine-tuning a 0.5B model often matches gpt-4o-mini zero-shot on simple classification — and is much cheaper at inference time.
When fine-tuning beats zero-shot prompting
Fine-tuning wins on: domain-specific jargon that the large model's own training data covers poorly ("churned", "MRR", "CAC" in a SaaS context), strict and consistent output-format requirements, very high volume (inference cost savings), and latency-sensitive applications.
Zero-shot prompting wins on: tasks requiring broad knowledge, small datasets (< 50 examples per class), and when the task changes frequently.