Observability
Logging individual traces is necessary but insufficient. Observability means you can answer operational questions at scale: What's my p95 latency this week? Which user is responsible for 40% of my token spend? Did this model upgrade increase or decrease my error rate? This note covers metric aggregation, alerting, and dashboard design for LLM applications.
Learning objectives
- Define the key metrics for LLM observability: latency, cost, error rate, quality
- Build a metrics aggregator that computes percentiles and rates
- Implement budget alerts before costs hit your limit
- Design a dashboard structure for LLM applications
- Set up Prometheus-compatible metrics counters
The four LLM observability signals
# 1. LATENCY: How long does each call take?
# Key metrics: p50, p95, p99, time-to-first-token
# Alert when: p95 exceeds SLA (e.g., > 5 seconds)
# 2. COST: How much are we spending?
# Key metrics: cost per request, cost per user, daily/monthly total
# Alert when: daily spend exceeds budget threshold
# 3. ERROR RATE: How often do calls fail?
# Key metrics: % API errors, % timeout errors, % malformed outputs
# Alert when: error rate > 1% sustained over 5 minutes
# 4. QUALITY: Are outputs good?
# Key metrics: LLM-as-judge scores, user feedback rate, task success rate
# Alert when: quality score drops > 5% vs baseline
Metrics aggregator
import time
import math
from collections import defaultdict
from dataclasses import dataclass, field
@dataclass
class MetricPoint:
    """A single recorded observation of a metric."""

    timestamp: float  # time.time() at the moment the sample was recorded
    value: float
    labels: dict  # free-form dimensions, e.g. {"model": "gpt-4o-mini"}


class MetricsStore:
    """In-memory store of metric samples with time-windowed aggregations.

    Samples are appended per metric name and are never evicted, so memory
    grows with the number of recorded points. All aggregations look only at
    samples newer than ``window_seconds`` (default: one hour).
    """

    def __init__(self):
        self._metrics: dict[str, list[MetricPoint]] = defaultdict(list)

    def record(self, metric_name: str, value: float, labels: dict = None) -> None:
        """Append one sample for ``metric_name``, timestamped with the current time."""
        self._metrics[metric_name].append(
            MetricPoint(timestamp=time.time(), value=value, labels=labels or {})
        )

    def _recent(self, metric_name: str, window_seconds: int = 3600) -> list[float]:
        """Return the values of samples recorded within the last ``window_seconds``."""
        cutoff = time.time() - window_seconds
        # .get() instead of indexing: reading an unknown metric must not create
        # an empty entry in the underlying defaultdict.
        points = self._metrics.get(metric_name, ())
        return [p.value for p in points if p.timestamp > cutoff]

    @staticmethod
    def _nearest_rank(sorted_values: list[float], p: float) -> float:
        """Nearest-rank percentile of a non-empty, ascending list.

        The index is clamped to the list bounds, so p <= 0 yields the minimum
        and p >= 100 yields the maximum.
        """
        idx = math.ceil(p / 100 * len(sorted_values)) - 1
        return sorted_values[min(len(sorted_values) - 1, max(0, idx))]

    def percentile(self, metric_name: str, p: float, window_seconds: int = 3600) -> float:
        """Return the ``p``-th percentile (0-100) over the window, or 0.0 if empty."""
        values = sorted(self._recent(metric_name, window_seconds))
        if not values:
            return 0.0
        return self._nearest_rank(values, p)

    def mean(self, metric_name: str, window_seconds: int = 3600) -> float:
        """Return the arithmetic mean over the window, or 0.0 if empty."""
        values = self._recent(metric_name, window_seconds)
        return sum(values) / len(values) if values else 0.0

    def count(self, metric_name: str, window_seconds: int = 3600) -> int:
        """Return the number of samples recorded within the window."""
        return len(self._recent(metric_name, window_seconds))

    def sum(self, metric_name: str, window_seconds: int = 3600) -> float:
        """Return the total of sample values recorded within the window."""
        # Inside a method body, `sum` resolves to the builtin, not this method.
        return sum(self._recent(metric_name, window_seconds))

    def report(self) -> None:
        """Print count, sum, p50 and p95 for every metric over the last hour."""
        print("\n=== Metrics Report (last hour) ===")
        for metric in self._metrics:
            # Apply the same one-hour window the header advertises.
            recent = self._recent(metric, 3600)
            if not recent:
                continue
            print(f"\n{metric}:")
            print(f" count={len(recent)}, sum={sum(recent):.4f}")
            ordered = sorted(recent)
            p50 = self._nearest_rank(ordered, 50)
            p95 = self._nearest_rank(ordered, 95)
            print(f" p50={p50:.2f}, p95={p95:.2f}")
# Instrument your LLM wrapper
# Module-level store; instrumented_call records every sample into this instance.
metrics = MetricsStore()
def instrumented_call(messages: list[dict], model: str = "gpt-4o-mini") -> str:
    """Call the chat completions API and record latency, tokens, cost and status.

    Args:
        messages: Chat messages passed straight through to the API.
        model: Model name; also used as the ``model`` label on every metric.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: Any error from the API call or from reading the response is
            re-raised after an error count and the elapsed latency are recorded.
    """
    import os

    from openai import OpenAI

    # USD per 1M tokens as (input, output). Input and output tokens are billed
    # at different rates, so pricing total_tokens at the input rate
    # underestimates spend. NOTE(review): list prices at time of writing -
    # confirm against the provider's current pricing page.
    pricing = {"gpt-4o-mini": (0.15, 0.60), "gpt-4o": (2.50, 10.00)}

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    start = time.perf_counter()
    try:
        response = client.chat.completions.create(model=model, messages=messages, max_tokens=200)
        latency_ms = (time.perf_counter() - start) * 1000
        # Read everything that can fail before recording anything, so a
        # malformed response is counted once as an error and its latency is
        # not recorded twice.
        reply = response.choices[0].message.content
        prompt_tokens = response.usage.prompt_tokens
        completion_tokens = response.usage.completion_tokens
    except Exception:
        metrics.record("llm.requests_total", 1, {"model": model, "status": "error"})
        metrics.record("llm.latency_ms", (time.perf_counter() - start) * 1000, {"model": model})
        raise

    metrics.record("llm.latency_ms", latency_ms, {"model": model})
    metrics.record("llm.prompt_tokens", prompt_tokens, {"model": model})
    metrics.record("llm.completion_tokens", completion_tokens, {"model": model})
    # Unknown models fall back to the gpt-4o-mini rates.
    input_price, output_price = pricing.get(model, pricing["gpt-4o-mini"])
    cost = (prompt_tokens * input_price + completion_tokens * output_price) / 1e6
    metrics.record("llm.cost_usd", cost, {"model": model})
    metrics.record("llm.requests_total", 1, {"model": model, "status": "success"})
    return reply
# Run some calls and view metrics
import os
# Push a few sample prompts through the instrumented wrapper (each one is a
# real API call), then print the aggregated metrics.
questions = ["What is Python?", "What is RAG?", "What is fine-tuning?"]
for question in questions:
    user_message = {"role": "user", "content": question}
    instrumented_call([user_message])

metrics.report()

p95_latency_ms = metrics.percentile("llm.latency_ms", 95)
print(f"\np95 latency: {p95_latency_ms:.0f}ms")

total_cost_usd = metrics.sum("llm.cost_usd")
print(f"Total cost: ${total_cost_usd:.6f}")
Budget alert system
import os
import time
from threading import Timer
from typing import Callable
class BudgetAlert:
    """Accumulate spend over a 24-hour window and alert as thresholds are crossed.

    The window starts when the instance is created and restarts the first
    time ``record_spend`` is called more than 86400 seconds after the current
    window began (a rolling window, not a calendar day). Each threshold fires
    at most once per window.

    Args:
        daily_budget_usd: Budget for one window, in USD. Must be positive.
        alert_thresholds: Fractions of the budget at which to alert.
            Defaults to ``[0.5, 0.75, 0.9, 1.0]``.
        on_alert: Callback invoked with the alert message.

    Raises:
        ValueError: If ``daily_budget_usd`` is not positive.
    """

    def __init__(
        self,
        daily_budget_usd: float,
        alert_thresholds: list[float] = None,
        on_alert: Callable[[str], None] = print,
    ):
        # Fail fast: a zero budget would otherwise surface later as a
        # ZeroDivisionError inside record_spend.
        if daily_budget_usd <= 0:
            raise ValueError("daily_budget_usd must be positive")
        # None sentinel instead of a mutable list default shared across calls.
        if alert_thresholds is None:
            alert_thresholds = [0.5, 0.75, 0.9, 1.0]
        self.daily_budget = daily_budget_usd
        self.thresholds = sorted(alert_thresholds)
        self.on_alert = on_alert
        self._triggered: set[float] = set()
        self._daily_spend = 0.0
        self._day_start = time.time()

    def record_spend(self, amount_usd: float) -> None:
        """Add ``amount_usd`` to the running total and fire newly crossed alerts.

        If a single call crosses several thresholds, one alert is sent per
        threshold, in ascending order.
        """
        # Reset daily counter if day has passed
        now = time.time()
        if now - self._day_start > 86400:
            self._daily_spend = 0.0
            self._day_start = now
            self._triggered.clear()
        self._daily_spend += amount_usd
        pct = self._daily_spend / self.daily_budget
        for threshold in self.thresholds:
            if pct >= threshold and threshold not in self._triggered:
                self._triggered.add(threshold)
                self.on_alert(
                    f"[BUDGET ALERT] {threshold*100:.0f}% of daily budget used: "
                    f"${self._daily_spend:.4f} / ${self.daily_budget:.2f}"
                )
# Example
def send_alert(message: str) -> None:
    """Example alert sink: print ``message`` to stdout behind a siren emoji."""
    # In production: send to Slack, PagerDuty, email, etc.
    formatted = f"🚨 {message}"
    print(formatted)
# Feed a day's worth of simulated spend into a $5.00 budget; alerts are
# delivered through send_alert as thresholds are crossed.
alert = BudgetAlert(daily_budget_usd=5.00, on_alert=send_alert)
simulated_spend = [1.0, 1.5, 1.0, 0.75, 0.5]  # Simulated spend over the day
for cost in simulated_spend:
    alert.record_spend(cost)
Prometheus metrics (production-grade)
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time
# Define metrics
# Request counter, split by model, calling function and outcome.
llm_requests = Counter(
    name="llm_requests_total",
    documentation="Total LLM API calls",
    labelnames=["model", "function", "status"],
)
# Call latency in seconds, bucketed from 100 ms up to 10 s.
llm_latency = Histogram(
    name="llm_latency_seconds",
    documentation="LLM call latency",
    labelnames=["model"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
)
# Token counter; the "type" label separates prompt from completion tokens.
llm_tokens = Counter(
    name="llm_tokens_total",
    documentation="Total tokens consumed",
    labelnames=["model", "type"],
)
# Cumulative spend in USD per model.
llm_cost = Counter(
    name="llm_cost_usd_total",
    documentation="Total cost in USD",
    labelnames=["model"],
)
def prometheus_instrumented_call(messages: list, model: str = "gpt-4o-mini", function_name: str = "unknown") -> str:
    """Call the chat completions API and update the Prometheus metrics.

    Args:
        messages: Chat messages passed straight through to the API.
        model: Model name; used as the ``model`` label.
        function_name: Name of the calling feature; used as the ``function``
            label on the request counter.

    Returns:
        The content of the first choice's message.

    Raises:
        Exception: Any error from the API call or from reading the response is
            re-raised after the error counter and latency histogram are updated.
    """
    import os

    from openai import OpenAI

    # USD per 1M tokens as (input, output). NOTE(review): list prices at time
    # of writing - confirm against the provider's current pricing page.
    pricing = {"gpt-4o-mini": (0.15, 0.60), "gpt-4o": (2.50, 10.00)}

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    start = time.perf_counter()
    try:
        response = client.chat.completions.create(model=model, messages=messages, max_tokens=200)
        latency = time.perf_counter() - start
        # Read everything that can fail before touching any metric, so one
        # call is never counted as both a success and an error.
        reply = response.choices[0].message.content
        prompt_tokens = response.usage.prompt_tokens
        completion_tokens = response.usage.completion_tokens
    except Exception:
        llm_requests.labels(model=model, function=function_name, status="error").inc()
        # Failed calls (e.g. timeouts) are observed too, so they are not
        # hidden from latency percentiles.
        llm_latency.labels(model=model).observe(time.perf_counter() - start)
        raise

    llm_requests.labels(model=model, function=function_name, status="success").inc()
    llm_latency.labels(model=model).observe(latency)
    llm_tokens.labels(model=model, type="prompt").inc(prompt_tokens)
    llm_tokens.labels(model=model, type="completion").inc(completion_tokens)
    # llm_cost is defined alongside the other metrics but was never updated.
    # Unknown models fall back to the gpt-4o-mini rates.
    input_price, output_price = pricing.get(model, pricing["gpt-4o-mini"])
    llm_cost.labels(model=model).inc(
        (prompt_tokens * input_price + completion_tokens * output_price) / 1e6
    )
    return reply
# Start Prometheus metrics server on port 8001
# start_http_server(8001)
# Metrics available at http://localhost:8001/metrics
Prometheus + Grafana is the standard stack for LLM observability
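Expose metrics with prometheus_client, scrape them with Prometheus, and visualize them in Grafana. For p95 latency, apply histogram_quantile to the histogram's buckets in Prometheus, e.g. histogram_quantile(0.95, sum by (le) (rate(llm_latency_seconds_bucket[5m]))). Alert with Alertmanager when thresholds are crossed.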