Caching Strategies¶
LLM calls are expensive and slow. For identical or near-identical inputs, you can serve the cached result in milliseconds at near-zero cost. Two types of caching matter for LLM applications: exact-match caching for identical prompts, and semantic caching for similar queries.
Learning objectives¶
- Implement exact-match caching with TTL
- Build a semantic cache using embeddings
- Add a cache layer to a FastAPI endpoint
- Measure cache hit rate and cost savings
- Know when caching is appropriate vs harmful
Exact-match cache¶
import os
import hashlib
import json
import time
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
class ExactMatchCache:
def __init__(self, ttl_seconds: int = 3600):
self._store: dict[str, dict] = {}
self.ttl = ttl_seconds
self.hits = 0
self.misses = 0
def _key(self, messages: list[dict], model: str, temperature: float) -> str:
"""Deterministic cache key from request parameters."""
payload = json.dumps({"messages": messages, "model": model, "temperature": temperature}, sort_keys=True)
return hashlib.sha256(payload.encode()).hexdigest()
def get(self, messages: list[dict], model: str, temperature: float) -> str | None:
key = self._key(messages, model, temperature)
entry = self._store.get(key)
if entry and time.time() - entry["timestamp"] < self.ttl:
self.hits += 1
return entry["response"]
self.misses += 1
return None
def set(self, messages: list[dict], model: str, temperature: float, response: str) -> None:
key = self._key(messages, model, temperature)
self._store[key] = {"response": response, "timestamp": time.time()}
@property
def hit_rate(self) -> float:
total = self.hits + self.misses
return self.hits / total if total > 0 else 0.0
cache = ExactMatchCache(ttl_seconds=300)
def cached_completion(messages: list[dict], model: str = "gpt-4o-mini", temperature: float = 0.0) -> str:
"""Completion with exact-match caching. Cache only for deterministic calls (temperature=0)."""
if temperature > 0:
# Don't cache non-deterministic outputs
response = client.chat.completions.create(model=model, messages=messages, temperature=temperature)
return response.choices[0].message.content
cached = cache.get(messages, model, temperature)
if cached:
return cached
start = time.perf_counter()
response = client.chat.completions.create(model=model, messages=messages, temperature=temperature)
latency = (time.perf_counter() - start) * 1000
result = response.choices[0].message.content
cache.set(messages, model, temperature, result)
print(f" [MISS] API call: {latency:.0f}ms | tokens: {response.usage.total_tokens}")
return result
# Test caching
questions = [
"What is the capital of France?",
"What is the capital of France?", # Identical — should hit cache
"What is the capital of Germany?",
"What is the capital of France?", # Hit again
]
for q in questions:
msg = [{"role": "user", "content": q}]
result = cached_completion(msg)
status = "HIT " if cache.hits > 0 and q == questions[0] else "MISS" if "France" in q else "MISS"
print(f" Q: {q[:40]} → {result[:30]}...")
print(f"\nCache hit rate: {cache.hit_rate:.0%} ({cache.hits} hits / {cache.misses} misses)")
Semantic cache¶
Exact-match misses on "What is the capital of France?" vs "Tell me the capital city of France." Semantic caching handles this by checking if similar queries have cached answers.
import os
import time
import numpy as np
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
class SemanticCache:
def __init__(self, similarity_threshold: float = 0.92, ttl_seconds: int = 3600):
self.threshold = similarity_threshold
self.ttl = ttl_seconds
self._entries: list[dict] = [] # [{embedding, query, response, timestamp}]
self.hits = 0
self.misses = 0
self.embedding_calls = 0
def _embed(self, text: str) -> np.ndarray:
self.embedding_calls += 1
resp = client.embeddings.create(model="text-embedding-3-small", input=[text])
emb = np.array(resp.data[0].embedding)
return emb / np.linalg.norm(emb)
def get(self, query: str) -> str | None:
q_emb = self._embed(query)
now = time.time()
# Find the most similar cached entry above threshold
best_score = 0.0
best_entry = None
for entry in self._entries:
if now - entry["timestamp"] > self.ttl:
continue
score = float(q_emb @ entry["embedding"])
if score > best_score:
best_score = score
best_entry = entry
if best_entry and best_score >= self.threshold:
self.hits += 1
print(f" [SEMANTIC HIT] similarity={best_score:.3f} | matched: '{best_entry['query'][:50]}'")
return best_entry["response"]
self.misses += 1
return None
def set(self, query: str, response: str) -> None:
q_emb = self._embed(query)
self._entries.append({
"query": query, "response": response,
"embedding": q_emb, "timestamp": time.time()
})
sem_cache = SemanticCache(similarity_threshold=0.90)
def semantically_cached_completion(query: str) -> str:
cached = sem_cache.get(query)
if cached:
return cached
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": query}],
temperature=0.0,
)
result = response.choices[0].message.content
sem_cache.set(query, result)
return result
# Test semantic similarity matching
queries = [
"What is the capital of France?",
"Tell me the capital city of France.", # Semantically similar → HIT
"Which city serves as France's capital?", # Semantically similar → HIT
"What is the population of Paris?", # Different topic → MISS
"What is the capital of Germany?", # Different country → MISS
]
for q in queries:
result = semantically_cached_completion(q)
print(f"Q: {q[:55]:<55} A: {result[:30]}...")
print(f"\nHit rate: {sem_cache.hit_rate:.0%} | Embedding calls: {sem_cache.embedding_calls}")
Cache integration in FastAPI¶
import os
import time
from fastapi import FastAPI
from pydantic import BaseModel
from openai import AsyncOpenAI
import hashlib, json
app = FastAPI()
aclient = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
_cache: dict[str, dict] = {}
CACHE_TTL = 300 # 5 minutes
def cache_key(message: str) -> str:
return hashlib.md5(message.encode()).hexdigest()
class ChatRequest(BaseModel):
message: str
use_cache: bool = True
@app.post("/chat")
async def chat(request: ChatRequest):
key = cache_key(request.message)
if request.use_cache:
entry = _cache.get(key)
if entry and time.time() - entry["ts"] < CACHE_TTL:
return {**entry["data"], "cached": True, "cache_age_s": int(time.time() - entry["ts"])}
start = time.perf_counter()
response = await aclient.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": request.message}],
temperature=0.0,
)
latency_ms = (time.perf_counter() - start) * 1000
data = {
"response": response.choices[0].message.content,
"tokens": response.usage.total_tokens,
"latency_ms": round(latency_ms, 0),
}
_cache[key] = {"data": data, "ts": time.time()}
return {**data, "cached": False}
When to cache and when not to¶
| Cache type | Good for | Avoid for |
|---|---|---|
| Exact match | FAQ answers, fixed-template outputs | Personalized responses |
| Semantic | Variations of common questions | Creative tasks (temperature > 0) |
| Both | Static knowledge, documentation Q&A | Real-time data (prices, news) |
| Neither | User-specific, time-sensitive, stateful | Anything requiring freshness |
Never cache responses that include personal data or vary by user context
A cached response to "What's my account balance?" that's served to a different user is a data breach. Scope your cache key to include user_id for any personalized endpoint.