Tracing and Logging¶
An LLM call that fails silently is the worst kind of production bug. You see a bad output in your dashboard but you have no record of the prompt that caused it, the model version, the latency, or the token count. Tracing captures this context for every call. Logging captures it in a queryable, persistent format.
Learning objectives¶
- Implement structured logging for LLM requests and responses
- Build a trace context that captures all inputs and outputs
- Wrap the OpenAI client with a transparent logging layer
- Measure and log per-call latency and token usage
- Design a log schema that supports debugging and cost analysis
What to capture in every LLM trace¶
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional
import json
@dataclass
class LLMTrace:
# Identity
trace_id: str
session_id: str
timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
# Request
model: str = ""
messages: list[dict] = field(default_factory=list)
temperature: float = 0.0
max_tokens: Optional[int] = None
tools: Optional[list] = None
# Response
response: str = ""
finish_reason: str = ""
# Metrics
prompt_tokens: int = 0
completion_tokens: int = 0
total_tokens: int = 0
latency_ms: float = 0.0
cost_usd: float = 0.0
# Context
function_name: str = "" # Which part of your code called the LLM
tags: list[str] = field(default_factory=list)
error: Optional[str] = None
def to_json(self) -> str:
return json.dumps(asdict(self), default=str)
Logging wrapper for the OpenAI client¶
import os
import time
import uuid
import json
import logging
from openai import OpenAI
from dataclasses import dataclass, field, asdict
from typing import Optional
from datetime import datetime
# Configure structured logging
logging.basicConfig(
level=logging.INFO,
format='%(message)s', # JSON lines
)
logger = logging.getLogger("llm_tracer")
# Pricing per 1M tokens (update as prices change)
PRICING = {
"gpt-4o": {"input": 2.50, "output": 10.00},
"gpt-4o-mini": {"input": 0.15, "output": 0.60},
"gpt-4-turbo": {"input": 10.00, "output": 30.00},
"claude-sonnet-4-6": {"input": 3.00, "output": 15.00},
}
def compute_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
price = PRICING.get(model, {"input": 0.15, "output": 0.60})
return (prompt_tokens / 1e6 * price["input"]) + (completion_tokens / 1e6 * price["output"])
class TracedOpenAI:
"""OpenAI client wrapper that logs every call as a structured trace."""
def __init__(self, session_id: str = None):
self._client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.session_id = session_id or str(uuid.uuid4())[:8]
self.traces: list[dict] = []
def chat(
self,
messages: list[dict],
model: str = "gpt-4o-mini",
function_name: str = "unknown",
tags: list[str] = None,
**kwargs,
) -> str:
trace_id = str(uuid.uuid4())[:8]
start_time = time.perf_counter()
error = None
try:
response = self._client.chat.completions.create(
model=model,
messages=messages,
**kwargs,
)
reply = response.choices[0].message.content
finish_reason = response.choices[0].finish_reason
usage = response.usage
except Exception as e:
error = str(e)
raise
finally:
latency_ms = (time.perf_counter() - start_time) * 1000
trace = {
"trace_id": trace_id,
"session_id": self.session_id,
"timestamp": datetime.utcnow().isoformat(),
"model": model,
"function_name": function_name,
"tags": tags or [],
"prompt_tokens": getattr(usage, "prompt_tokens", 0) if not error else 0,
"completion_tokens": getattr(usage, "completion_tokens", 0) if not error else 0,
"total_tokens": getattr(usage, "total_tokens", 0) if not error else 0,
"cost_usd": compute_cost(
model,
getattr(usage, "prompt_tokens", 0) if not error else 0,
getattr(usage, "completion_tokens", 0) if not error else 0,
),
"latency_ms": round(latency_ms, 1),
"finish_reason": finish_reason if not error else "error",
"error": error,
"input_preview": messages[-1]["content"][:100] if messages else "",
"output_preview": reply[:100] if not error else "",
}
self.traces.append(trace)
logger.info(json.dumps(trace))
return reply
def summary(self) -> dict:
if not self.traces:
return {}
return {
"total_calls": len(self.traces),
"total_tokens": sum(t["total_tokens"] for t in self.traces),
"total_cost_usd": round(sum(t["cost_usd"] for t in self.traces), 6),
"avg_latency_ms": round(sum(t["latency_ms"] for t in self.traces) / len(self.traces), 1),
"errors": sum(1 for t in self.traces if t["error"]),
}
# Usage
client = TracedOpenAI(session_id="demo")
client.chat(
messages=[{"role": "user", "content": "What is Python?"}],
model="gpt-4o-mini",
function_name="explain_concept",
tags=["education", "python"],
)
client.chat(
messages=[{"role": "user", "content": "Give me a code example."}],
model="gpt-4o-mini",
function_name="code_example",
tags=["education", "python", "code"],
)
print("\nSession summary:")
print(json.dumps(client.summary(), indent=2))
Log schema best practices¶
# Good log entry — queryable, complete
GOOD_LOG = {
"trace_id": "a3f8c2",
"session_id": "user-XYZ-session-001",
"timestamp": "2025-05-24T10:23:45.123Z",
"model": "gpt-4o-mini",
"function_name": "classify_ticket", # Where in your code this was called
"tags": ["support", "classification"],
"prompt_tokens": 245,
"completion_tokens": 3,
"cost_usd": 0.0000037,
"latency_ms": 412,
"finish_reason": "stop",
"error": None,
}
# Bad log entry — not actionable when things go wrong
BAD_LOG = {
"time": "10:23",
"response": "negative", # No cost, no tokens, no request context
}
Log input previews and output previews, not full prompts
Full prompts can be thousands of tokens. Store previews (first 100–200 chars) in your fast log. Store full payloads in cheap object storage (S3, GCS) keyed by trace_id — retrieve on demand when debugging.