# FastAPI Wrappers
Wrapping your LLM pipeline in a FastAPI endpoint is the first step to making it accessible as a service. A production endpoint needs more than just calling the OpenAI API — it needs request validation, authentication, rate limiting, error handling, and structured responses that client code can depend on.
## Learning objectives
- Build a FastAPI app that wraps an LLM pipeline
- Define Pydantic request and response models
- Implement API key authentication with dependency injection
- Add rate limiting and request size validation
- Return proper HTTP status codes for LLM errors
## Minimal working FastAPI LLM endpoint
# main.py
import os
from fastapi import FastAPI, HTTPException, Header
from pydantic import BaseModel, Field
from typing import Optional
from openai import OpenAI, APIError, RateLimitError
# Application object that the route decorators in this snippet register on.
app = FastAPI(title="LLM API", version="1.0.0")
# One OpenAI client created at import time and shared by all requests.
# The key comes from the OPENAI_API_KEY environment variable; os.getenv
# yields None when that variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
class ChatRequest(BaseModel):
    # Request body for POST /chat. Every field carries its own bounds so
    # that out-of-range input fails model validation.

    # Required user text, 1 to 4000 characters.
    message: str = Field(
        ...,
        min_length=1,
        max_length=4000,
        description="User message",
    )
    # Optional system prompt, at most 2000 characters; defaults to None.
    system_prompt: Optional[str] = Field(default=None, max_length=2000)
    # Sampling temperature in [0.0, 2.0]; defaults to 0.0.
    temperature: float = Field(default=0.0, ge=0.0, le=2.0)
    # Completion token cap in [1, 4000]; defaults to 500.
    max_tokens: int = Field(default=500, ge=1, le=4000)
class ChatResponse(BaseModel):
    # Response body returned by POST /chat.

    response: str  # text of the model's reply
    model: str  # name of the model that was called
    prompt_tokens: int  # tokens counted for the input messages
    completion_tokens: int  # tokens counted for the generated reply
    cost_usd: float  # cost estimate in USD computed by the handler
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Send one user message to the model and return its reply, token usage, and estimated cost."""
    # Function-scope import so this block stays self-contained (stdlib only).
    import asyncio

    model_name = "gpt-4o-mini"
    # USD per one million tokens, used only for the cost estimate below.
    input_price_per_million = 0.15
    output_price_per_million = 0.60

    messages = []
    if request.system_prompt:
        messages.append({"role": "system", "content": request.system_prompt})
    messages.append({"role": "user", "content": request.message})

    try:
        # `client.chat.completions.create` is a blocking call. Running it in
        # a worker thread keeps the event loop free to serve other requests
        # while this one waits on the upstream API.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model=model_name,
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
    except RateLimitError as e:
        # Caught before APIError so upstream throttling maps to 429, not 502.
        # Retry-After mirrors the 60 seconds promised in the message.
        raise HTTPException(
            status_code=429,
            detail="Rate limit exceeded. Try again in 60 seconds.",
            headers={"Retry-After": "60"},
        ) from e
    except APIError as e:
        raise HTTPException(
            status_code=502,
            detail=f"Upstream API error: {e.message}",
        ) from e

    # Guard against a missing usage block or empty content so that building
    # the response model cannot fail on None.
    usage = response.usage
    prompt_tokens = usage.prompt_tokens if usage else 0
    completion_tokens = usage.completion_tokens if usage else 0
    cost = (prompt_tokens / 1e6 * input_price_per_million) + (
        completion_tokens / 1e6 * output_price_per_million
    )

    return ChatResponse(
        response=response.choices[0].message.content or "",
        model=model_name,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        cost_usd=round(cost, 8),
    )
@app.get("/health")
async def health():
    # Liveness probe: returns a fixed payload and checks no dependencies.
    payload = {"status": "ok"}
    return payload
# Run with: uvicorn main:app --reload
## API key authentication
import os
import secrets
from fastapi import FastAPI, HTTPException, Security
from fastapi.security import APIKeyHeader
# Application object that the route decorator in this snippet registers on.
app = FastAPI()
# Security scheme that reads the key from the X-API-Key request header.
# NOTE(review): auto_error=False presumably lets a missing header reach
# verify_api_key as an empty value so it can raise its own 401 — confirm
# against the FastAPI APIKeyHeader documentation.
API_KEY_HEADER = APIKeyHeader(name="X-API-Key", auto_error=False)
# Maps each accepted API key to that caller's metadata: user name, tier,
# and requests-per-minute allowance ("rpm").
# WARNING: when API_KEY_1 / API_KEY_2 are unset, the literal fallback keys
# "dev-key-001" / "dev-key-002" are accepted. Set both variables in any
# deployment that is reachable by untrusted clients.
VALID_API_KEYS = {
os.getenv("API_KEY_1", "dev-key-001"): {"user": "user1", "tier": "free", "rpm": 10},
os.getenv("API_KEY_2", "dev-key-002"): {"user": "user2", "tier": "pro", "rpm": 100},
}
def verify_api_key(api_key: str = Security(API_KEY_HEADER)) -> dict:
    """Authenticate the caller by the key taken from API_KEY_HEADER.

    Returns:
        The metadata dict stored in VALID_API_KEYS for the matching key.

    Raises:
        HTTPException: 401 when no key was supplied, 403 when the supplied
            key matches none of the configured keys.
    """
    if not api_key:
        raise HTTPException(status_code=401, detail="API key required. Pass X-API-Key header.")

    # Compare as bytes: secrets.compare_digest rejects non-ASCII str input,
    # and a header value is not guaranteed to be ASCII.
    supplied = api_key.encode("utf-8")
    matched = None
    for known_key, info in VALID_API_KEYS.items():
        # Constant-time comparison, and no early exit from the loop, so the
        # response time does not reveal how much of a key was correct or
        # which entry matched.
        if secrets.compare_digest(supplied, known_key.encode("utf-8")):
            matched = info

    if matched is None:
        raise HTTPException(status_code=403, detail="Invalid API key.")
    return matched
from fastapi import Depends
@app.post("/chat")
async def chat(request: dict, user_info: dict = Depends(verify_api_key)):
    # verify_api_key runs as a dependency; whatever it returns (the metadata
    # of the authenticated caller) arrives here as user_info. The request
    # body is accepted but not used by this demo handler.
    user, tier = user_info["user"], user_info["tier"]
    return {"user": user, "tier": tier, "message": "authenticated"}
## Rate limiting with a sliding window
import time
from collections import defaultdict, deque
from fastapi import HTTPException, Depends
class RateLimiter:
    """In-memory sliding-window rate limiter keyed by an arbitrary string.

    All state lives in this object, so counts are not shared with other
    instances and are lost when the object is discarded.
    """

    def __init__(self):
        # key -> monotonic timestamps of accepted requests, oldest first.
        self._windows: dict[str, deque] = defaultdict(deque)

    def check(self, key: str, limit: int, window_seconds: int = 60) -> None:
        """Record one request for ``key`` or reject it if the window is full.

        Args:
            key: Identifier whose requests are counted together.
            limit: Maximum number of requests accepted per window.
            window_seconds: Length of the sliding window in seconds.

        Raises:
            HTTPException: 429 with a ``Retry-After`` header when ``key``
                already has ``limit`` requests inside the window. The
                rejected request is not recorded.
        """
        # Monotonic clock: wall-clock time can jump when the system clock is
        # adjusted, which would evict entries too early or keep them too long.
        now = time.monotonic()
        window = self._windows[key]

        # Drop timestamps that have aged out of the window.
        cutoff = now - window_seconds
        while window and window[0] < cutoff:
            window.popleft()

        if len(window) >= limit:
            raise HTTPException(
                status_code=429,
                detail=f"Rate limit exceeded: {limit} requests per {window_seconds}s",
                headers={"Retry-After": str(window_seconds)},
            )
        window.append(now)
# Single module-level limiter used by check_rate_limit.
rate_limiter = RateLimiter()


def check_rate_limit(user_info: dict = Depends(verify_api_key)) -> dict:
    """Apply the caller's per-minute allowance, then pass the metadata through.

    Depends on verify_api_key, so authentication runs first. Requests are
    counted per ``user_info["user"]`` over a 60-second window; the limit is
    the caller's ``"rpm"`` value, or 10 when that key is absent. The
    limiter raises when the allowance is exhausted.
    """
    rate_limiter.check(
        user_info["user"],
        limit=user_info.get("rpm", 10),
        window_seconds=60,
    )
    return user_info
@app.post("/chat-limited")
async def chat_limited(request: dict, user_info: dict = Depends(check_rate_limit)):
    # check_rate_limit has already run by the time this body executes; the
    # request body is accepted but not used by this demo handler.
    accepted = {"message": "Request accepted", "user": user_info["user"]}
    return accepted
## Structured error responses
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
class ErrorResponse(BaseModel):
    # Uniform JSON body emitted by the exception handlers in this snippet.

    error: str  # human-readable description of what went wrong
    error_code: str  # short machine-readable code for client branching
    details: str = ""  # optional extra context; empty unless provided
# NOTE(review): this binds `app` to a fresh FastAPI instance. If this snippet
# lives in the same module as earlier ones that also assign `app`, the
# handlers registered below attach to this new instance rather than to the
# one the earlier routes were registered on — confirm which app is served.
app = FastAPI()
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render an HTTPException as an ErrorResponse JSON body.

    The status code is preserved and any headers attached to the exception
    (for example ``Retry-After`` on a 429) are forwarded to the client.
    """
    # HTTPException.detail may be any JSON-serialisable value, while
    # ErrorResponse.error must be a string; coerce it so the handler itself
    # cannot fail validation.
    detail = exc.detail if isinstance(exc.detail, str) else str(exc.detail)
    body = ErrorResponse(
        error=detail,
        error_code=f"HTTP_{exc.status_code}",
    )
    return JSONResponse(
        status_code=exc.status_code,
        content=body.model_dump(),
        # Without this, headers set where the exception was raised are
        # silently dropped by the custom handler.
        headers=getattr(exc, "headers", None),
    )
@app.exception_handler(Exception)
async def general_exception_handler(request: Request, exc: Exception):
    # Catch-all for exceptions no more specific handler claimed. Clients get
    # a generic 500 body; the exception text is included only when the DEBUG
    # environment variable is exactly "true".
    debug_enabled = os.getenv("DEBUG") == "true"
    if debug_enabled:
        details = str(exc)
    else:
        details = ""

    body = ErrorResponse(
        error="Internal server error",
        error_code="INTERNAL_ERROR",
        details=details,
    )
    return JSONResponse(status_code=500, content=body.model_dump())
Return 429 for rate limits, 502 for upstream errors, 422 for validation
FastAPI automatically returns 422 for Pydantic validation failures (e.g., message too long). Return 429 with a Retry-After header for rate limits. Return 502 (bad gateway) rather than 500 when the OpenAI API itself fails — it signals a dependency failure, not a bug in your code.