FastAPI Wrappers

Wrapping your LLM pipeline in a FastAPI endpoint is the first step to making it accessible as a service. A production endpoint needs more than just calling the OpenAI API — it needs request validation, authentication, rate limiting, error handling, and structured responses that client code can depend on.

Learning objectives

Build a FastAPI app that wraps an LLM pipeline
Define Pydantic request and response models
Implement API key authentication with dependency injection
Add rate limiting and request size validation
Return proper HTTP status codes for LLM errors

Minimal working FastAPI LLM endpoint

# main.py
import os
from fastapi import FastAPI, HTTPException, Header
from pydantic import BaseModel, Field
from typing import Optional
from openai import OpenAI, APIError, RateLimitError

# Application instance that the route decorators below register handlers on.
app = FastAPI(title="LLM API", version="1.0.0")
# Module-level OpenAI client shared by every request. The key is read from the
# OPENAI_API_KEY environment variable; os.getenv returns None when it is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

class ChatRequest(BaseModel):
    """Request body for the chat endpoint; the Field constraints bound each input."""

    # Required user text, between 1 and 4000 characters.
    message: str = Field(..., min_length=1, max_length=4000, description="User message")
    # Optional system prompt, at most 2000 characters; defaults to None.
    system_prompt: Optional[str] = Field(None, max_length=2000)
    # Sampling temperature, constrained to [0.0, 2.0]; defaults to 0.0.
    temperature: float = Field(0.0, ge=0.0, le=2.0)
    # Upper bound on completion tokens, constrained to [1, 4000]; defaults to 500.
    max_tokens: int = Field(500, ge=1, le=4000)

class ChatResponse(BaseModel):
    """Response body returned by the chat endpoint."""

    # Text of the model's reply.
    response: str
    # Name of the model that produced the reply.
    model: str
    # Token counts as reported in the completion's usage data.
    prompt_tokens: int
    completion_tokens: int
    # Cost of the call in US dollars, derived from the two token counts.
    cost_usd: float

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Send one user message (plus an optional system prompt) to the model.

    Args:
        request: Validated chat payload.

    Returns:
        A ChatResponse holding the completion text, the model name, the
        prompt/completion token counts and the estimated cost in USD.

    Raises:
        HTTPException: 429 with a Retry-After header when the upstream API
            rate-limits the call; 502 for any other upstream API error.
    """
    import asyncio  # local import keeps this snippet self-contained

    model_name = "gpt-4o-mini"

    messages = []
    if request.system_prompt:
        messages.append({"role": "system", "content": request.system_prompt})
    messages.append({"role": "user", "content": request.message})

    try:
        # `client` is the synchronous OpenAI client. Calling it directly in an
        # `async def` handler would block the event loop for the entire
        # network round trip, so the call is moved to a worker thread.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model=model_name,
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
    except RateLimitError as exc:
        # RateLimitError must be handled before APIError, its base class.
        raise HTTPException(
            status_code=429,
            detail="Rate limit exceeded. Try again in 60 seconds.",
            # Machine-readable counterpart of the hint in `detail`.
            headers={"Retry-After": "60"},
        ) from exc
    except APIError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Upstream API error: {exc.message}",
        ) from exc

    usage = response.usage
    # Per-million-token prices: 0.15 USD for prompt, 0.60 USD for completion.
    cost = (usage.prompt_tokens / 1e6 * 0.15) + (usage.completion_tokens / 1e6 * 0.60)

    # `content` may be None (for example when the model returns no text);
    # ChatResponse.response is a required str, so fall back to an empty string
    # instead of failing response validation with a 500.
    content = response.choices[0].message.content or ""

    return ChatResponse(
        response=content,
        model=model_name,
        prompt_tokens=usage.prompt_tokens,
        completion_tokens=usage.completion_tokens,
        cost_usd=round(cost, 8),
    )

@app.get("/health")
async def health():
    """Return a static payload indicating the service is responding."""
    payload = {"status": "ok"}
    return payload

# Run with: uvicorn main:app --reload

API key authentication

import os
import secrets
from fastapi import FastAPI, HTTPException, Security
from fastapi.security import APIKeyHeader

# Application instance that the authenticated route below is registered on.
app = FastAPI()
# Header scheme that reads the key from the `X-API-Key` request header.
# NOTE(review): auto_error=False presumably lets a missing header reach
# verify_api_key as an empty value so it can raise its own 401 — confirm
# against the FastAPI security documentation.
API_KEY_HEADER = APIKeyHeader(name="X-API-Key", auto_error=False)

# Maps each accepted API key to that caller's metadata (user name, tier and
# requests-per-minute allowance).
# NOTE(review): when API_KEY_1 / API_KEY_2 are unset, the hard-coded
# "dev-key-00x" fallbacks become valid keys — make sure both variables are set
# in any non-development deployment.
VALID_API_KEYS = {
    os.getenv("API_KEY_1", "dev-key-001"): {"user": "user1", "tier": "free", "rpm": 10},
    os.getenv("API_KEY_2", "dev-key-002"): {"user": "user2", "tier": "pro", "rpm": 100},
}

def verify_api_key(api_key: str = Security(API_KEY_HEADER)) -> dict:
    """Authenticate the caller from the X-API-Key header.

    Args:
        api_key: Value of the X-API-Key header, injected by FastAPI; falsy
            when the header was not supplied.

    Returns:
        The metadata dict stored in VALID_API_KEYS for the matching key.

    Raises:
        HTTPException: 401 when no key was supplied, 403 when the key does
            not match any configured key.
    """
    if not api_key:
        raise HTTPException(status_code=401, detail="API key required. Pass X-API-Key header.")

    # Compare as bytes: secrets.compare_digest rejects non-ASCII str input.
    supplied = api_key.encode("utf-8")
    matched = None
    # Constant-time comparison against every configured key, with no early
    # exit, so response time does not reveal how much of a guess was correct
    # or which entry it matched.
    for known_key, info in VALID_API_KEYS.items():
        if secrets.compare_digest(supplied, known_key.encode("utf-8")):
            matched = info

    if matched is None:
        raise HTTPException(status_code=403, detail="Invalid API key.")
    return matched

from fastapi import Depends

@app.post("/chat")
async def chat(request: dict, user_info: dict = Depends(verify_api_key)):
    """Echo the authenticated caller's user name and tier.

    `user_info` is the metadata dict produced by the verify_api_key
    dependency; the request body itself is not inspected.
    """
    reply = {field: user_info[field] for field in ("user", "tier")}
    reply["message"] = "authenticated"
    return reply

Rate limiting with a sliding window

import time
from collections import defaultdict, deque
from fastapi import HTTPException, Depends

class RateLimiter:
    """In-memory sliding-window rate limiter keyed by an arbitrary string.

    Each key owns a deque of timestamps for its accepted requests. State
    lives only in this object, so it is neither shared between processes nor
    preserved across restarts.
    """

    def __init__(self):
        import threading  # local import keeps the class self-contained

        self._windows: dict[str, deque] = defaultdict(deque)
        # Guards the prune/check/append sequence: synchronous FastAPI
        # dependencies run in a threadpool, so check() can be entered
        # concurrently for the same key.
        self._lock = threading.Lock()

    def check(self, key: str, limit: int, window_seconds: int = 60) -> None:
        """Record one request for `key`, or reject it if the key is over its limit.

        Args:
            key: Identifier whose requests are counted together.
            limit: Maximum number of requests allowed per window.
            window_seconds: Length of the sliding window in seconds.

        Raises:
            HTTPException: 429 when `key` already has `limit` requests inside
                the window. The Retry-After header carries the number of
                whole seconds until the oldest counted request expires.
        """
        import math

        # Monotonic clock: wall-clock adjustments (NTP, DST) must not expire
        # or extend entries.
        now = time.monotonic()

        with self._lock:
            window = self._windows[key]

            # Remove expired timestamps
            cutoff = now - window_seconds
            while window and window[0] < cutoff:
                window.popleft()

            if len(window) >= limit:
                if window:
                    # A slot frees up when the oldest entry leaves the window.
                    retry_after = max(1, math.ceil(window[0] + window_seconds - now))
                else:
                    # Only reachable when limit <= 0; no better estimate exists.
                    retry_after = window_seconds
                raise HTTPException(
                    status_code=429,
                    detail=f"Rate limit exceeded: {limit} requests per {window_seconds}s",
                    headers={"Retry-After": str(retry_after)},
                )
            window.append(now)

# Single module-level limiter instance; its per-key windows live in this
# process's memory only.
rate_limiter = RateLimiter()

def check_rate_limit(user_info: dict = Depends(verify_api_key)) -> dict:
    """Apply the caller's per-minute quota, then hand the metadata through.

    The allowance is taken from the "rpm" entry of `user_info`, falling back
    to 10 when that entry is absent. Any exception raised by the limiter
    propagates to the caller unchanged.
    """
    per_minute = user_info["rpm"] if "rpm" in user_info else 10
    caller = user_info["user"]
    rate_limiter.check(caller, limit=per_minute, window_seconds=60)
    return user_info

@app.post("/chat-limited")
async def chat_limited(request: dict, user_info: dict = Depends(check_rate_limit)):
    """Acknowledge a request that passed authentication and rate limiting."""
    accepted = dict(message="Request accepted", user=user_info["user"])
    return accepted

Structured error responses

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel

class ErrorResponse(BaseModel):
    """Uniform JSON body used for every error response."""

    # Human-readable error message.
    error: str
    # Machine-readable code, e.g. "HTTP_429" or "INTERNAL_ERROR".
    error_code: str
    # Optional extra detail; defaults to an empty string.
    details: str = ""

# Application instance that the exception handlers below are registered on.
app = FastAPI()

@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render an HTTPException as an ErrorResponse JSON body.

    Args:
        request: The request being handled (unused).
        exc: The raised HTTPException.

    Returns:
        A JSONResponse carrying the exception's status code, an ErrorResponse
        payload, and any headers attached to the exception.
    """
    # `detail` is not guaranteed to be a string (callers may pass a dict or
    # list); ErrorResponse.error requires str, so coerce rather than fail
    # validation inside the error handler itself.
    detail = exc.detail if isinstance(exc.detail, str) else str(exc.detail)
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            error=detail,
            error_code=f"HTTP_{exc.status_code}",
        ).model_dump(),
        # Forward headers set on the exception (e.g. Retry-After on a 429);
        # without this the custom handler silently drops them.
        headers=getattr(exc, "headers", None),
    )

@app.exception_handler(Exception)
async def general_exception_handler(request: Request, exc: Exception):
    """Turn any otherwise unhandled exception into a generic 500 response.

    The exception text is included in `details` only when the DEBUG
    environment variable equals "true"; otherwise `details` is empty.
    """
    debug_enabled = os.getenv("DEBUG") == "true"
    payload = ErrorResponse(
        error="Internal server error",
        error_code="INTERNAL_ERROR",
        details=str(exc) if debug_enabled else "",
    )
    return JSONResponse(status_code=500, content=payload.model_dump())

Return 429 for rate limits, 502 for upstream errors, 422 for validation

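FastAPI automatically returns 422 for Pydantic validation failures (e.g., a message longer than `max_length`). Return 429 with a Retry-After header for rate limits so clients know when to retry; if you register a custom HTTPException handler, forward `exc.headers` to the response, otherwise that header is dropped. Return 502 (Bad Gateway) rather than 500 when the OpenAI API itself fails — it signals a dependency failure, not a bug in your code.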

00-agenda | 02-streaming-responses