Skip to content

Observability

Logging individual traces is necessary but insufficient. Observability means you can answer operational questions at scale: What's my p95 latency this week? Which user is responsible for 40% of my token spend? Did this model upgrade increase or decrease my error rate? This note covers metric aggregation, alerting, and dashboard design for LLM applications.

Learning objectives

  • Define the key metrics for LLM observability: latency, cost, error rate, quality
  • Build a metrics aggregator that computes percentiles and rates
  • Implement budget alerts before costs hit your limit
  • Design a dashboard structure for LLM applications
  • Set up Prometheus-compatible metrics counters

The four LLM observability signals

# 1. LATENCY: How long does each call take?
#    Key metrics: p50, p95, p99, time-to-first-token
#    Alert when: p95 exceeds SLA (e.g., > 5 seconds)

# 2. COST: How much are we spending?
#    Key metrics: cost per request, cost per user, daily/monthly total
#    Alert when: daily spend exceeds budget threshold

# 3. ERROR RATE: How often do calls fail?
#    Key metrics: % API errors, % timeout errors, % malformed outputs
#    Alert when: error rate > 1% sustained over 5 minutes

# 4. QUALITY: Are outputs good?
#    Key metrics: LLM-as-judge scores, user feedback rate, task success rate
#    Alert when: quality score drops > 5% vs baseline

Metrics aggregator

import time
import math
from collections import defaultdict
from dataclasses import dataclass, field

@dataclass
class MetricPoint:
    """A single recorded observation of one metric.

    ``labels`` is optional and defaults to a fresh empty dict per instance,
    so unlabelled points can be created without passing ``{}`` explicitly.
    """

    # Unix time in seconds at which the value was recorded.
    timestamp: float
    # The observed measurement (latency, token count, cost, ...).
    value: float
    # Free-form dimensions attached to the observation, e.g. {"model": ...}.
    labels: dict = field(default_factory=dict)

class MetricsStore:
    """In-memory metric store with time-windowed aggregations.

    Points are appended per metric name and never evicted, so memory grows
    with the number of recorded observations. All aggregations look only at
    points newer than ``window_seconds`` (default: one hour).
    """

    # Window summarised by report(); keeps the data in line with the
    # "last hour" wording of the report header.
    _REPORT_WINDOW_SECONDS = 3600

    def __init__(self):
        self._metrics: dict[str, list[MetricPoint]] = defaultdict(list)

    def record(self, metric_name: str, value: float, labels: "dict | None" = None) -> None:
        """Append one observation for ``metric_name``, stamped with the current time."""
        self._metrics[metric_name].append(
            MetricPoint(timestamp=time.time(), value=value, labels=labels or {})
        )

    def _recent(self, metric_name: str, window_seconds: int = 3600) -> list[float]:
        """Return the values of ``metric_name`` recorded within the window."""
        cutoff = time.time() - window_seconds
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty list for every unknown metric name that is merely queried.
        points = self._metrics.get(metric_name, ())
        return [point.value for point in points if point.timestamp > cutoff]

    @staticmethod
    def _nearest_rank(sorted_values: list[float], p: float) -> float:
        """Return the nearest-rank ``p``-th percentile of a non-empty sorted list.

        The rank is clamped to the valid index range, so ``p <= 0`` yields the
        minimum and ``p >= 100`` yields the maximum instead of raising.
        """
        rank = math.ceil(p / 100 * len(sorted_values)) - 1
        return sorted_values[min(max(rank, 0), len(sorted_values) - 1)]

    def percentile(self, metric_name: str, p: float, window_seconds: int = 3600) -> float:
        """Return the ``p``-th percentile (0-100) over the window, or 0.0 if empty."""
        values = sorted(self._recent(metric_name, window_seconds))
        if not values:
            return 0.0
        return self._nearest_rank(values, p)

    def mean(self, metric_name: str, window_seconds: int = 3600) -> float:
        """Return the arithmetic mean over the window, or 0.0 if empty."""
        values = self._recent(metric_name, window_seconds)
        return sum(values) / len(values) if values else 0.0

    def count(self, metric_name: str, window_seconds: int = 3600) -> int:
        """Return the number of observations within the window."""
        return len(self._recent(metric_name, window_seconds))

    def sum(self, metric_name: str, window_seconds: int = 3600) -> float:
        """Return the total of all values within the window."""
        # Inside a method body ``sum`` still resolves to the builtin; the
        # method of the same name lives only in the class namespace.
        return sum(self._recent(metric_name, window_seconds))

    def report(self) -> None:
        """Print count, sum, p50 and p95 for every metric with data in the last hour."""
        print("\n=== Metrics Report (last hour) ===")
        # list() snapshots the names so the loop is unaffected by later inserts.
        for metric in list(self._metrics):
            values = sorted(self._recent(metric, self._REPORT_WINDOW_SECONDS))
            if not values:
                continue
            print(f"\n{metric}:")
            print(f"  count={len(values)}, sum={sum(values):.4f}")
            p50 = self._nearest_rank(values, 50)
            p95 = self._nearest_rank(values, 95)
            print(f"  p50={p50:.2f}, p95={p95:.2f}")

# Instrument your LLM wrapper
# Module-level store: instrumented_call() records into it and the demo code
# further down reads its aggregates.
metrics = MetricsStore()

def instrumented_call(messages: list[dict], model: str = "gpt-4o-mini") -> str:
    """Send a chat completion request and record latency, tokens, cost and status.

    Args:
        messages: Chat messages in the OpenAI ``[{"role": ..., "content": ...}]`` form.
        model: Model name; also used as the ``model`` label on every metric.

    Returns:
        The content of the first choice of the response.

    Raises:
        Exception: Whatever the API call raises is re-raised after an error
            request and its latency have been recorded.
    """
    import os
    from openai import OpenAI

    # USD per 1M tokens as (input, output). Output tokens are billed at a
    # higher rate than input tokens, so one flat price under-reports cost.
    # NOTE(review): list prices at time of writing - confirm against current pricing.
    prices = {"gpt-4o-mini": (0.15, 0.60), "gpt-4o": (2.50, 10.00)}
    # Unknown models fall back to the gpt-4o-mini rates.
    input_price, output_price = prices.get(model, prices["gpt-4o-mini"])

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    start = time.perf_counter()
    # Only the API call sits inside the try, so a failure while recording
    # metrics is never miscounted as an API error.
    try:
        response = client.chat.completions.create(model=model, messages=messages, max_tokens=200)
    except Exception:
        metrics.record("llm.requests_total", 1, {"model": model, "status": "error"})
        metrics.record("llm.latency_ms", (time.perf_counter() - start) * 1000, {"model": model})
        raise

    latency_ms = (time.perf_counter() - start) * 1000
    reply = response.choices[0].message.content
    usage = response.usage

    metrics.record("llm.latency_ms", latency_ms, {"model": model})
    metrics.record("llm.prompt_tokens", usage.prompt_tokens, {"model": model})
    metrics.record("llm.completion_tokens", usage.completion_tokens, {"model": model})

    cost = (
        usage.prompt_tokens / 1e6 * input_price
        + usage.completion_tokens / 1e6 * output_price
    )
    metrics.record("llm.cost_usd", cost, {"model": model})
    metrics.record("llm.requests_total", 1, {"model": model, "status": "success"})

    return reply

# Run some calls and view metrics
# Each call below goes through instrumented_call(), which builds an OpenAI
# client from the OPENAI_API_KEY environment variable and performs a real
# API request; an exception from any call propagates and stops the loop.
import os
questions = ["What is Python?", "What is RAG?", "What is fine-tuning?"]
for q in questions:
    instrumented_call([{"role": "user", "content": q}])

# Per-metric summary, followed by two individual aggregates that use the
# default one-hour window.
metrics.report()
print(f"\np95 latency: {metrics.percentile('llm.latency_ms', 95):.0f}ms")
print(f"Total cost: ${metrics.sum('llm.cost_usd'):.6f}")

Budget alert system

import os
import time
from threading import Timer
from typing import Callable

class BudgetAlert:
    """Track spend against a daily budget and fire an alert per crossed threshold.

    The "day" is a rolling 24-hour period that starts when the object is
    created (or when the previous period was reset), not a calendar day.
    Each threshold fires at most once per period.
    """

    # Fractions of the daily budget that trigger an alert by default.
    DEFAULT_THRESHOLDS = (0.5, 0.75, 0.9, 1.0)
    _DAY_SECONDS = 86400

    def __init__(
        self,
        daily_budget_usd: float,
        # Quoted so the union syntax is never evaluated at definition time.
        alert_thresholds: "list[float] | None" = None,
        on_alert: Callable[[str], None] = print,
    ):
        """Create a tracker.

        Args:
            daily_budget_usd: Budget for one 24-hour period; must be positive.
            alert_thresholds: Fractions of the budget (e.g. ``0.5`` for 50%)
                at which to alert. ``None`` selects ``DEFAULT_THRESHOLDS``.
            on_alert: Callback invoked with the formatted alert message.

        Raises:
            ValueError: If ``daily_budget_usd`` is not positive.
        """
        # Fail here rather than with ZeroDivisionError on the first spend.
        if daily_budget_usd <= 0:
            raise ValueError("daily_budget_usd must be positive")
        if alert_thresholds is None:
            alert_thresholds = self.DEFAULT_THRESHOLDS

        self.daily_budget = daily_budget_usd
        self.thresholds = sorted(alert_thresholds)
        self.on_alert = on_alert
        self._triggered: set[float] = set()
        self._daily_spend = 0.0
        self._day_start = time.time()

    def record_spend(self, amount_usd: float) -> None:
        """Add ``amount_usd`` to the current period and fire newly crossed alerts."""
        now = time.time()
        # Start a fresh period (and re-arm every threshold) once 24h have passed.
        if now - self._day_start > self._DAY_SECONDS:
            self._daily_spend = 0.0
            self._day_start = now
            self._triggered.clear()

        self._daily_spend += amount_usd
        pct = self._daily_spend / self.daily_budget

        for threshold in self.thresholds:
            if pct < threshold:
                # Thresholds are sorted ascending; none of the rest can match.
                break
            if threshold in self._triggered:
                continue
            self._triggered.add(threshold)
            self.on_alert(
                f"[BUDGET ALERT] {threshold*100:.0f}% of daily budget used: "
                f"${self._daily_spend:.4f} / ${self.daily_budget:.2f}"
            )

# Example
def send_alert(message: str) -> None:
    """Print ``message`` to stdout behind a siren emoji.

    Stand-in notifier for the example. In production: send to Slack,
    PagerDuty, email, etc.
    """
    formatted = f"🚨 {message}"
    print(formatted)

# With a $5.00 budget the cumulative spend below is 1.00, 2.50, 3.50, 4.25,
# 4.75, so the 50%, 75% and 90% alerts fire (at 2.50, 4.25 and 4.75) and the
# 100% alert never does.
alert = BudgetAlert(daily_budget_usd=5.00, on_alert=send_alert)
for cost in [1.0, 1.5, 1.0, 0.75, 0.5]:  # Simulated spend over the day
    alert.record_spend(cost)

Prometheus metrics (production-grade)

from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time

# Define metrics
# One collector per signal; label names are declared up front and must be
# supplied via .labels(...) whenever a sample is recorded.
llm_requests = Counter(
    "llm_requests_total",
    "Total LLM API calls",
    labelnames=["model", "function", "status"],
)
llm_latency = Histogram(
    "llm_latency_seconds",
    "LLM call latency",
    labelnames=["model"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
)
llm_tokens = Counter(
    "llm_tokens_total",
    "Total tokens consumed",
    labelnames=["model", "type"],
)
llm_cost = Counter(
    "llm_cost_usd_total",
    "Total cost in USD",
    labelnames=["model"],
)

def prometheus_instrumented_call(messages: list, model: str = "gpt-4o-mini", function_name: str = "unknown") -> str:
    """Send a chat completion request and export Prometheus metrics for it.

    Updates the request counter (by status), the latency histogram (on both
    success and failure), the token counters and the cost counter.

    Args:
        messages: Chat messages passed through to the API unchanged.
        model: Model name; used as the ``model`` label.
        function_name: Caller-supplied tag used as the ``function`` label.

    Returns:
        The content of the first choice of the response.

    Raises:
        Exception: Whatever the API call raises is re-raised after the error
            counter and latency have been updated.
    """
    import os
    from openai import OpenAI

    # USD per 1M tokens as (input, output); unknown models fall back to the
    # gpt-4o-mini rates.
    # NOTE(review): list prices at time of writing - confirm against current pricing.
    prices = {"gpt-4o-mini": (0.15, 0.60), "gpt-4o": (2.50, 10.00)}
    input_price, output_price = prices.get(model, prices["gpt-4o-mini"])

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    start = time.perf_counter()
    # Only the API call sits inside the try, so a failure while updating
    # metrics is never miscounted as an API error.
    try:
        response = client.chat.completions.create(model=model, messages=messages, max_tokens=200)
    except Exception:
        # Failed calls are timed too; slow failures (e.g. timeouts) would
        # otherwise be invisible in the latency histogram.
        llm_latency.labels(model=model).observe(time.perf_counter() - start)
        llm_requests.labels(model=model, function=function_name, status="error").inc()
        raise

    latency = time.perf_counter() - start
    usage = response.usage

    llm_requests.labels(model=model, function=function_name, status="success").inc()
    llm_latency.labels(model=model).observe(latency)
    llm_tokens.labels(model=model, type="prompt").inc(usage.prompt_tokens)
    llm_tokens.labels(model=model, type="completion").inc(usage.completion_tokens)

    cost = (
        usage.prompt_tokens / 1e6 * input_price
        + usage.completion_tokens / 1e6 * output_price
    )
    llm_cost.labels(model=model).inc(cost)

    return response.choices[0].message.content

# Start Prometheus metrics server on port 8001
# start_http_server(8001)
# Metrics available at http://localhost:8001/metrics

Prometheus + Grafana is the standard stack for LLM observability

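Export metrics with prometheus_client, scrape them with Prometheus, and visualize them in Grafana. For p95 latency, apply histogram_quantile to the histogram's bucket series, e.g. histogram_quantile(0.95, sum by (le) (rate(llm_latency_seconds_bucket[5m]))). Alert with Alertmanager when thresholds are crossed.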


04-latency-optimization | 06-practice-exercises