Practice Exercises — OpenAI and Anthropic APIs
Three levels: warm-up modifies a single parameter, main builds a mini-pipeline, stretch handles failure cases.
Exercise 1 — Streaming vs. non-streaming latency (Warm-up)
Compare the time-to-first-token of a streaming request with the total response time of a non-streaming request for the same prompt.
import os
import time
from openai import OpenAI
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Single prompt sent by both measurement functions so their timings are comparable.
PROMPT = "Write a 200-word explanation of how neural networks learn."
def measure_non_streaming() -> tuple[float, str]:
    """Time one blocking chat completion for ``PROMPT``.

    Returns:
        ``(elapsed_seconds, reply_text)`` where the elapsed time covers the
        whole ``create`` call, i.e. until the full response is available.
    """
    t0 = time.perf_counter()
    completion = client.chat.completions.create(
        max_tokens=300,
        messages=[{"role": "user", "content": PROMPT}],
        model="gpt-4o",
    )
    # Stop the clock before touching the payload so only the request is timed.
    duration = time.perf_counter() - t0
    reply = completion.choices[0].message.content
    return duration, reply
def measure_streaming() -> "tuple[float | None, float, str]":
    """Time a streaming chat completion for ``PROMPT``.

    The request is issued with ``stream=True`` and the returned chunks are
    consumed one by one, recording when the first piece of text arrives.

    Returns:
        ``(time_to_first_token, total_time, text)``. Both times are seconds
        measured from just before the request is issued.
        ``time_to_first_token`` is ``None`` if the stream produced no text.
    """
    start = time.perf_counter()
    first_token_time = None
    pieces = []

    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": PROMPT}],
        max_tokens=300,
        stream=True,
    )
    for chunk in stream:
        # Chunks may carry no choices, or a delta with only role/finish
        # metadata; only chunks with actual text count as "tokens" here.
        if not chunk.choices:
            continue
        text = chunk.choices[0].delta.content
        if not text:
            continue
        if first_token_time is None:
            first_token_time = time.perf_counter() - start
        pieces.append(text)

    total_time = time.perf_counter() - start
    # Join once at the end instead of repeated string concatenation.
    return first_token_time, total_time, "".join(pieces)
# Run each measurement once; only the timings are kept, the generated text is discarded.
non_stream_time = measure_non_streaming()[0]
first_token, total_time = measure_streaming()[:2]

# Report all three timings in seconds.
print(f"Non-streaming total time: {non_stream_time:.2f}s")
print(f"Streaming time-to-first-token: {first_token:.2f}s")
print(f"Streaming total time: {total_time:.2f}s")
Expected result: Streaming time-to-first-token should be 0.5–1.5s while non-streaming total is 2–5s. This is why chat interfaces feel faster even though total generation time is similar.
Exercise 2 — Multi-provider comparison (Main)
Send the same prompt to GPT-4o and Claude Sonnet 4.6, then compare the responses and their cost.
import os
from openai import OpenAI
from anthropic import Anthropic
from dataclasses import dataclass
# One client per provider; each API key is read from its own environment variable.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
anthropic_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
@dataclass
class ComparisonResult:
    """Side-by-side outcome of sending one prompt to both providers."""

    # The user prompt that was sent to both models.
    prompt: str
    # OpenAI reply text and its estimated cost in US dollars.
    openai_response: str
    openai_cost_usd: float
    # Anthropic reply text and its estimated cost in US dollars.
    anthropic_response: str
    anthropic_cost_usd: float
def compare_providers(prompt: str, system: str = "") -> ComparisonResult:
    """Send the same prompt to GPT-4o and Claude and collect both replies.

    Args:
        prompt: User message sent to both providers.
        system: Optional system prompt. When non-empty it is passed to OpenAI
            as a leading ``system`` message and to Anthropic via the
            ``system`` request parameter.

    Returns:
        A ``ComparisonResult`` holding each reply and a cost estimate computed
        from the reported token usage and the hard-coded per-million-token
        rates below.
    """
    user_turn = [{"role": "user", "content": prompt}]

    # --- OpenAI: the system prompt travels inside the message list.
    openai_messages = user_turn
    if system:
        openai_messages = [{"role": "system", "content": system}, *user_turn]
    oai_resp = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=openai_messages,
        max_tokens=300,
    )
    oai_text = oai_resp.choices[0].message.content
    oai_usage = oai_resp.usage
    oai_cost = (oai_usage.prompt_tokens * 2.50 + oai_usage.completion_tokens * 10.00) / 1_000_000

    # --- Anthropic: the system prompt is a separate request parameter.
    anthropic_params = dict(model="claude-sonnet-4-6", max_tokens=300, messages=user_turn)
    if system:
        anthropic_params["system"] = system
    ant_resp = anthropic_client.messages.create(**anthropic_params)
    ant_text = ant_resp.content[0].text
    ant_usage = ant_resp.usage
    ant_cost = (ant_usage.input_tokens * 3.00 + ant_usage.output_tokens * 15.00) / 1_000_000

    return ComparisonResult(
        prompt=prompt,
        openai_response=oai_text,
        openai_cost_usd=oai_cost,
        anthropic_response=ant_text,
        anthropic_cost_usd=ant_cost,
    )
# Demo run: one question, answered by both providers under the same system prompt.
demo_prompt = "What are three non-obvious risks of using LLMs in production systems?"
demo_system = "You are a senior ML engineer with production deployment experience."
result = compare_providers(prompt=demo_prompt, system=demo_system)

print("=== GPT-4o ===")
print(result.openai_response)
print(f"Cost: ${result.openai_cost_usd:.6f}\n")

print("=== Claude Sonnet 4.6 ===")
print(result.anthropic_response)
print(f"Cost: ${result.anthropic_cost_usd:.6f}\n")

# Ratio > 1 means the OpenAI call was the more expensive of the two.
cost_ratio = result.openai_cost_usd / result.anthropic_cost_usd
print(f"OpenAI vs Anthropic cost ratio: {cost_ratio:.2f}x")
Extension: Run the same comparison on 10 different prompts and calculate average cost per response. Which model is cheaper per useful output token?
Exercise 3 — Tool-using research assistant (Main)
Build an assistant that uses tools to look up information before answering.
import os
import json
from datetime import datetime
from openai import OpenAI
# OpenAI client for the tool-calling loop; the API key is read from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Simulated knowledge base: lowercase term -> one-sentence definition.
KNOWLEDGE_BASE = {
    "transformer": "A transformer is a neural network architecture based on self-attention, introduced in 'Attention Is All You Need' (2017).",
    "rag": "RAG (Retrieval-Augmented Generation) combines a retrieval system with a language model to answer questions using external documents.",
    "fine-tuning": "Fine-tuning adapts a pretrained model to a specific task using labeled examples, updating model weights.",
    "langchain": "LangChain is a framework for building LLM applications with composable chains, agents, and memory.",
}

# Characters stripped from the edges of each word before comparison.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

# Common function words that carry no topical meaning. Without this filter a
# query word such as "is" matches almost every definition.
_STOPWORDS = frozenset({
    "a", "about", "all", "an", "and", "are", "as", "at", "be", "by", "can",
    "do", "does", "for", "from", "how", "i", "if", "in", "is", "it", "me",
    "my", "of", "on", "or", "that", "the", "this", "to", "was", "what",
    "when", "where", "which", "who", "why", "will", "with", "you",
})


def _content_words(text: str) -> set:
    """Return the lowercase, punctuation-trimmed, non-stopword words of *text*."""
    trimmed = (word.strip(_EDGE_PUNCTUATION) for word in text.lower().split())
    return {word for word in trimmed if word and word not in _STOPWORDS}


def search_docs(query: str) -> dict:
    """Look up knowledge-base entries relevant to *query*.

    An entry matches when its term appears anywhere in the lowercased query,
    or when the query and the definition share at least one whole word that is
    not a stopword. Whole-word comparison avoids accidental substring hits
    (for example "work" inside "network").

    Args:
        query: Free-text search string.

    Returns:
        ``{"query": query, "results": [...]}`` where each result is a
        ``{"term", "definition"}`` dict. When nothing matches, ``results``
        holds a single "not found" placeholder entry.
    """
    query_lower = query.lower()
    query_words = _content_words(query)
    results = [
        {"term": key, "definition": content}
        for key, content in KNOWLEDGE_BASE.items()
        if key in query_lower or query_words & _content_words(content)
    ]
    return {
        "query": query,
        "results": results or [{"term": "not found", "definition": "No matching documentation found."}],
    }
def calculate(expression: str) -> dict:
    """Safely evaluate a plain arithmetic expression.

    Supports integer and float literals, parentheses, unary ``+``/``-`` and
    the binary operators ``+ - * / // **``.

    SECURITY NOTE: the previous implementation passed the string to ``eval()``
    after a character whitelist. The whitelist blocks names, but it still
    admits exponent towers such as ``9**9**9**9`` (which can hang the process)
    and comma tuples (``1,000 * 2`` evaluated to a tuple, not a number). The
    expression is treated as untrusted input, so it is now parsed with ``ast``
    and only the node types listed above are evaluated.

    Args:
        expression: Arithmetic expression text.

    Returns:
        ``{"expression": expression, "result": value}`` on success, or
        ``{"error": message}`` if the expression is rejected or fails
        (for example on division by zero).
    """
    # Function-scope imports keep this block self-contained.
    import ast
    import operator

    invalid_message = "Invalid expression — only arithmetic allowed"

    # Cheap first-pass filter; keeps the original error message for anything
    # that is obviously not arithmetic.
    allowed_chars = set("0123456789+-*/()., ")
    if not all(c in allowed_chars for c in expression):
        return {"error": invalid_message}

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    # Upper bound on the approximate bit size of an integer power result.
    max_power_bits = 10_000

    def evaluate(node):
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            if (
                isinstance(node.op, ast.Pow)
                and isinstance(left, int)
                and isinstance(right, int)
                and right > 0
                and left.bit_length() * right > max_power_bits
            ):
                raise ValueError("Exponent too large")
            return binary_ops[type(node.op)](left, right)
        # Tuples, calls and anything else the whitelist let through.
        raise ValueError(invalid_message)

    try:
        # strip(): ast.parse rejects leading whitespace in "eval" mode.
        result = evaluate(ast.parse(expression.strip(), mode="eval"))
    except Exception as e:
        # Errors are reported to the caller as data instead of being raised.
        return {"error": str(e)}
    return {"expression": expression, "result": result}
def _single_string_arg_tool(name: str, description: str, arg_name: str) -> dict:
    """Build a strict function-tool schema that takes one required string argument."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {arg_name: {"type": "string"}},
                "required": [arg_name],
                "additionalProperties": False,
            },
            "strict": True,
        },
    }


# Tool schemas advertised to the model; the names match the keys of REGISTRY.
TOOLS = [
    _single_string_arg_tool(
        "search_docs",
        "Search the technical documentation for information about AI/ML concepts.",
        "query",
    ),
    _single_string_arg_tool(
        "calculate",
        "Evaluate a mathematical expression.",
        "expression",
    ),
]
# Maps each tool name declared in TOOLS to the Python function that implements it.
REGISTRY = {"search_docs": search_docs, "calculate": calculate}
def _execute_tool_call(name: str, raw_arguments: str) -> dict:
    """Run one requested tool and return its result, or an ``{"error": ...}`` dict."""
    fn = REGISTRY.get(name)
    if fn is None:
        return {"error": f"Unknown tool: {name}"}
    try:
        args = json.loads(raw_arguments)
        return fn(**args)
    except (json.JSONDecodeError, TypeError) as exc:
        # Malformed JSON, or arguments that do not fit the tool's signature.
        return {"error": f"Invalid arguments for {name}: {exc}"}


def research_assistant(question: str, max_rounds: int = 8) -> str:
    """Answer *question*, letting the model call the registered tools first.

    Each round sends the conversation to the model. If the model requests
    tools, every call is executed and its JSON-encoded result is appended as a
    ``tool`` message before the next round. Tool failures (unknown tool, bad
    arguments) are reported back to the model as an error result instead of
    crashing the loop.

    Args:
        question: The user's question.
        max_rounds: Upper bound on model round-trips, so a model that keeps
            requesting tools cannot loop (and bill) forever.

    Returns:
        The model's final text answer (empty string if it returned no text).

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_rounds`` round-trips.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI research assistant. Always search the docs before answering technical questions."},
        {"role": "user", "content": question},
    ]
    for _ in range(max_rounds):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=TOOLS,
            tool_choice="auto",
        )
        choice = response.choices[0]
        assistant_msg = choice.message
        if choice.finish_reason != "tool_calls" or not assistant_msg.tool_calls:
            return assistant_msg.content or ""

        # The assistant turn containing the tool calls must precede their results.
        messages.append(assistant_msg)
        for tc in assistant_msg.tool_calls:
            result = _execute_tool_call(tc.function.name, tc.function.arguments)
            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": json.dumps(result),
            })
    raise RuntimeError(f"Tool loop did not finish within {max_rounds} rounds")
# Test it: one documentation question and one arithmetic question,
# with a separator line printed between the two answers.
demo_questions = (
    "What is RAG and how does it differ from fine-tuning?",
    "If I have 1000 documents and each costs $0.003 to embed, what's the total cost?",
)
for position, demo_question in enumerate(demo_questions):
    if position:
        print("\n---\n")
    print(research_assistant(demo_question))
Exercise 4 — Prompt caching cost analysis (Stretch)
Measure the actual cost savings from Anthropic prompt caching with a long system prompt.
import os
from anthropic import Anthropic
# Anthropic client; the API key is read from the ANTHROPIC_API_KEY environment variable.
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# Long system prompt reused verbatim on every request so it can serve as the cached prefix.
# NOTE(review): an earlier comment described this as "~2000 tokens"; the token count is
# not measured anywhere in this script — verify before relying on that figure.
LONG_SYSTEM = """
You are an expert data scientist specializing in machine learning and LLM systems.
Your expertise includes:
1. MACHINE LEARNING FUNDAMENTALS
- Supervised, unsupervised, and reinforcement learning
- Gradient descent, backpropagation, and optimization algorithms
- Regularization: L1/L2, dropout, early stopping
- Ensemble methods: bagging, boosting, stacking
- Model selection: cross-validation, hyperparameter tuning
2. DEEP LEARNING ARCHITECTURES
- Convolutional Neural Networks (CNNs) for computer vision
- Recurrent Neural Networks (RNNs) and LSTMs for sequences
- Transformers and attention mechanisms
- Diffusion models for generation
- Graph Neural Networks for relational data
3. LLM SYSTEMS AND APPLICATIONS
- Prompt engineering and optimization
- Retrieval-Augmented Generation (RAG)
- Fine-tuning with LoRA and QLoRA
- Evaluation frameworks: RAGAS, LLM-as-judge
- Agent architectures and tool use
- LangChain, LangGraph, and agent frameworks
4. PRODUCTION DEPLOYMENT
- API design for ML services
- Model serving: FastAPI, Triton, vLLM
- Monitoring: data drift, concept drift, performance metrics
- Cost optimization: caching, quantization, batching
- A/B testing for model evaluation
Always provide precise, actionable answers with code examples when relevant.
Be direct about tradeoffs and uncertainty. Cite specific techniques by name.
""" * 3 # Text repeated 3x to lengthen the prompt. NOTE(review): presumably meant to clear a 1024-token caching minimum — the length is not checked here; confirm.
# Questions asked one after another against the same system prompt.
QUESTIONS = [
"Explain the key difference between LoRA and full fine-tuning.",
"What metrics should I track in a RAG evaluation pipeline?",
"How do I implement sliding window attention for long sequences?",
"When should I use ensemble methods vs. a single large model?"
]
def run_with_caching(question: str) -> dict:
    """Ask one question with the long system prompt marked for caching.

    The system prompt is sent as a single text block carrying
    ``cache_control={"type": "ephemeral"}``.

    Args:
        question: User question to send.

    Returns:
        A dict with:
          - ``question``: the question, cut to 50 characters with a trailing
            ``...`` only when it was actually truncated;
          - ``cache_write`` / ``cache_read``: cache creation and cache read
            input-token counts, normalized to ``0`` when the API reports
            ``None``;
          - ``input`` / ``output``: the reported input and output token counts.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=200,
        system=[
            {
                "type": "text",
                "text": LONG_SYSTEM,
                "cache_control": {"type": "ephemeral"},
            }
        ],
        messages=[{"role": "user", "content": question}],
    )
    usage = response.usage
    label = question if len(question) <= 50 else question[:50] + "..."
    return {
        "question": label,
        # The cache counters can be None; callers do arithmetic and
        # ","-formatting on them, so normalize to integers here.
        "cache_write": usage.cache_creation_input_tokens or 0,
        "cache_read": usage.cache_read_input_tokens or 0,
        "input": usage.input_tokens,
        "output": usage.output_tokens,
    }
# The first call is expected to be a cache miss (cache_write > 0); later calls
# that hit the cache report cache_read > 0 instead.
print(f"Running {len(QUESTIONS)} questions with prompt caching...\n")
total_cost_without_cache = 0.0
total_cost_with_cache = 0.0
for i, q in enumerate(QUESTIONS, start=1):
    stats = run_with_caching(q)
    # Tolerate None counters so this loop does not depend on normalization upstream.
    cache_write = stats["cache_write"] or 0
    cache_read = stats["cache_read"] or 0
    uncached_input = stats["input"]
    output_cost = stats["output"] * 15.00

    # `input` counts only tokens that were neither written to nor read from the
    # cache, so the no-cache baseline must add the cached tokens back in and
    # bill all of them at the regular input rate.
    all_input = uncached_input + cache_write + cache_read
    cost_no_cache = (all_input * 3.00 + output_cost) / 1_000_000

    # With caching: regular rate for uncached input, 1.25x for cache writes,
    # 0.1x for cache reads.
    cost_cache = (
        uncached_input * 3.00
        + cache_write * 3.75
        + cache_read * 0.30
        + output_cost
    ) / 1_000_000

    total_cost_without_cache += cost_no_cache
    total_cost_with_cache += cost_cache
    print(f"Q{i}: {stats['question']}")
    print(f" Cache write: {cache_write:,} | Cache read: {cache_read:,}")
    print(f" Cost without cache: ${cost_no_cache:.6f}")
    print(f" Cost with cache: ${cost_cache:.6f}")
    print()

# Guard against an empty QUESTIONS list (zero baseline cost).
if total_cost_without_cache > 0:
    savings = (total_cost_without_cache - total_cost_with_cache) / total_cost_without_cache * 100
else:
    savings = 0.0
print(f"Total without cache: ${total_cost_without_cache:.6f}")
print(f"Total with cache: ${total_cost_with_cache:.6f}")
print(f"Savings: {savings:.1f}%")
Expected result: After the first call (cache write), subsequent calls should show ~80–90% cost reduction on the system prompt tokens.
Exercise 5 — Error handling under rate limits (Stretch)
Simulate rate limit errors and verify your retry logic handles them correctly.
import os
import time
import random
from unittest.mock import patch, MagicMock
from openai import RateLimitError, OpenAI
# Client used for the real request once the simulated failures are exhausted.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Number of flaky_completion() invocations so far; incremented on every call.
call_count = 0
def flaky_completion(prompt: str, fail_first_n: int = 2) -> str:
    """Simulate a function that fails with rate limits for the first N calls.

    Every invocation increments the module-level ``call_count``. While the
    counter is at most ``fail_first_n`` a ``RateLimitError`` built around a
    mocked 429 response is raised; after that a real completion is requested
    and its text returned.
    """
    global call_count
    call_count += 1

    if call_count > fail_first_n:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50,
        )
        return completion.choices[0].message.content

    # Simulate a 429 response.
    fake_response = MagicMock(status_code=429, headers={"retry-after": "1"})
    raise RateLimitError(
        message="Rate limit exceeded",
        response=fake_response,
        body={"error": {"message": "Rate limit exceeded", "type": "rate_limit_error"}},
    )
def with_retry(fn, max_retries: int = 5) -> str:
    """Call ``fn`` until it succeeds, backing off after each rate-limit error.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Maximum number of attempts.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        RateLimitError: Re-raised when the final attempt is rate limited.
        RuntimeError: If no attempt is made at all (``max_retries`` <= 0).
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return fn()
        except RateLimitError:
            if attempt == final_attempt:
                raise
        # Exponential backoff (1s, 2s, 4s, ...) plus up to 0.5s of jitter.
        wait = 2 ** attempt + random.uniform(0, 0.5)
        print(f"Rate limit on attempt {attempt + 1}, waiting {wait:.1f}s...")
        time.sleep(wait)
        attempt += 1
    raise RuntimeError("Max retries exceeded")
# Test: the first two calls are simulated 429s, the third goes through.
def _attempt() -> str:
    return flaky_completion("Say 'success' and nothing else.", fail_first_n=2)


# Reset the counter so the assertion below counts only this run.
call_count = 0
result = with_retry(_attempt)
print(f"Final result: {result}")
print(f"Total API calls made: {call_count}")
assert call_count == 3, f"Expected 3 calls (2 failures + 1 success), got {call_count}"
print("Test passed!")