Implementation — Function-Calling Data Extractor
Pydantic schemas
# schemas.py
from pydantic import BaseModel, Field
from typing import Optional
class InvoiceData(BaseModel):
    # Extraction target for invoices. Every business field defaults to None (or
    # an empty list) so a partially filled invoice still validates.
    # NOTE: documented with `#` comments on purpose -- Pydantic copies a model
    # docstring into the generated JSON schema, which would change the schema.
    vendor_name: Optional[str] = Field(
        default=None,
        description="Name of the company issuing the invoice",
    )
    invoice_number: Optional[str] = Field(
        default=None,
        description="Invoice ID or number",
    )
    # Dates are kept as free-form strings; no date parsing happens here.
    invoice_date: Optional[str] = Field(
        default=None,
        description="Invoice issue date (ISO 8601 preferred)",
    )
    due_date: Optional[str] = Field(
        default=None,
        description="Payment due date",
    )
    total_amount: Optional[float] = Field(
        default=None,
        description="Total invoice amount as a number",
    )
    currency: Optional[str] = Field(
        default=None,
        description="Currency code (USD, EUR, GBP, etc.)",
    )
    # Items are untyped dicts; the expected keys are only stated in the description.
    line_items: list[dict] = Field(
        default_factory=list,
        description="List of {description, quantity, unit_price, total}",
    )
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Extraction confidence 0–1",
    )
class SupportTicket(BaseModel):
    # Extraction target for customer support tickets. All descriptive fields
    # are optional and default to None when the text does not mention them.
    # NOTE: documented with `#` comments on purpose -- Pydantic copies a model
    # docstring into the generated JSON schema, which would change the schema.
    customer_email: Optional[str] = Field(
        default=None,
        description="Customer's email address",
    )
    subject: Optional[str] = Field(
        default=None,
        description="Ticket subject line",
    )
    # The allowed values below are guidance in the description only; the field
    # type is a plain string, so other values still validate.
    issue_category: Optional[str] = Field(
        default=None,
        description="One of: login, billing, performance, feature_request, bug, other",
    )
    urgency: Optional[str] = Field(
        default=None,
        description="One of: low, medium, high, critical",
    )
    browser: Optional[str] = Field(
        default=None,
        description="Browser name and version if mentioned",
    )
    os: Optional[str] = Field(
        default=None,
        description="Operating system if mentioned",
    )
    account_id: Optional[str] = Field(
        default=None,
        description="Customer account identifier if mentioned",
    )
    # Constrained to the closed interval [0.0, 1.0]; carries no description.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
# Registry of extraction schemas, keyed by the public schema name.
SCHEMAS: dict[str, type[BaseModel]] = {
    "invoice": InvoiceData,
    "support_ticket": SupportTicket,
}
Extractor with tool_use
# extractor.py
import os
import json
from openai import AsyncOpenAI
from pydantic import BaseModel, ValidationError
from schemas import SCHEMAS, InvoiceData, SupportTicket
# Shared async OpenAI client. The key is read from the environment when this
# module is imported, so OPENAI_API_KEY must already be set at that point.
aclient = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def build_tool(schema_class: type[BaseModel], tool_name: str) -> dict:
    """Wrap a Pydantic model's JSON schema in an OpenAI function-tool definition.

    The tool description is derived from ``tool_name`` by replacing
    underscores with spaces; the model's JSON schema becomes the function's
    ``parameters``.
    """
    readable_name = tool_name.replace("_", " ")
    function_spec = {
        "name": tool_name,
        "description": f"Extract structured {readable_name} data from the text",
        "parameters": schema_class.model_json_schema(),
    }
    return {"type": "function", "function": function_spec}
async def extract(text: str, schema_name: str) -> dict:
    """Extract structured data from ``text`` with the schema named ``schema_name``.

    The model is forced to call a single function tool built from the schema,
    and the returned arguments are validated with the Pydantic model.

    Args:
        text: Source text; only the first 8000 characters are sent to the model.
        schema_name: Key into ``SCHEMAS``.

    Returns:
        The validated model as a dict. If validation fails, the raw arguments
        are returned instead with ``confidence`` forced to 0.0 and a
        ``validation_errors`` list added.

    Raises:
        ValueError: If the schema name is unknown, the model returns no tool
            call, or the tool arguments are not a valid JSON object.
    """
    if schema_name not in SCHEMAS:
        raise ValueError(f"Unknown schema: {schema_name}. Choose from: {list(SCHEMAS.keys())}")
    schema_class = SCHEMAS[schema_name]
    tool_name = f"extract_{schema_name}"
    tool = build_tool(schema_class, tool_name)

    # Step 1: Call LLM with tool
    response = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    f"Extract structured data from the text using the {tool_name} function. "
                    f"Set confidence based on how complete and unambiguous the extraction is: "
                    f"1.0 = all fields clearly present, 0.5 = some fields missing, 0.0 = could not extract."
                ),
            },
            {"role": "user", "content": text[:8000]},  # truncate to avoid context overflow
        ],
        tools=[tool],
        tool_choice={"type": "function", "function": {"name": tool_name}},
        temperature=0.0,
    )

    # Step 2: Parse tool call arguments
    # Guard the empty-choices case too, so it surfaces as ValueError, not IndexError.
    if not response.choices or not response.choices[0].message.tool_calls:
        raise ValueError("LLM did not return a tool call")
    tool_call = response.choices[0].message.tool_calls[0]
    try:
        raw_args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError as e:
        # e.g. output cut off mid-JSON; report it in terms the caller can act on.
        raise ValueError(f"LLM returned malformed tool arguments: {e}") from e
    if not isinstance(raw_args, dict):
        # `schema_class(**raw_args)` would otherwise raise an uncaught TypeError.
        raise ValueError(
            f"LLM tool arguments must be a JSON object, got {type(raw_args).__name__}"
        )

    # Step 3: Validate with Pydantic
    try:
        validated = schema_class(**raw_args)
    except ValidationError as e:
        # Return what we got with confidence=0 and flag the errors
        raw_args["confidence"] = 0.0
        raw_args["validation_errors"] = e.errors()
        return raw_args
    return validated.model_dump()
FastAPI application
# app.py
import os
from contextlib import asynccontextmanager

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from pydantic import BaseModel

# Load .env BEFORE importing extractor: that module constructs its OpenAI
# client at import time from OPENAI_API_KEY, so a key defined only in .env
# must already be in the environment by then.
load_dotenv()

from extractor import extract  # noqa: E402  (must follow load_dotenv())
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook; currently a no-op placeholder.

    Nothing runs before or after the ``yield``, so no startup or shutdown
    work is performed.
    """
    yield
# ASGI application object; `uvicorn app:app` serves this instance.
app = FastAPI(title="Data Extractor API", version="1.0.0", lifespan=lifespan)
class TextExtractRequest(BaseModel):
    # JSON request body for POST /extract/text.
    text: str  # raw text to run extraction on
    schema_name: str = "invoice"  # name of the extraction schema; defaults to "invoice"
@app.post("/extract/text")
async def extract_from_text(request: TextExtractRequest):
    # Run extraction on raw text supplied in the JSON body.
    # Inputs with fewer than 20 non-padding characters are rejected up front.
    stripped_length = len(request.text.strip())
    if stripped_length < 20:
        raise HTTPException(status_code=400, detail="Text too short for extraction.")

    # Any ValueError raised by the extractor is reported as a client error.
    try:
        return await extract(request.text, request.schema_name)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
@app.post("/extract/file")
async def extract_from_file(
    file: UploadFile = File(...),
    schema_name: str = Form("invoice"),
):
    """Extract structured data from an uploaded .pdf, .txt, or .md file.

    Responses:
        413: the upload is larger than 5 MB.
        400: unsupported file type, text file that is not valid UTF-8, or a
            ValueError raised by the extractor.
        422: the PDF cannot be parsed, or the document yields fewer than 20
            characters of text (empty or image-only).
    """
    max_bytes = 5 * 1024 * 1024

    # Cheap pre-check on the reported size. It may be missing, so the bytes
    # actually read are checked as well.
    if file.size and file.size > max_bytes:
        raise HTTPException(status_code=413, detail="File too large (max 5MB)")
    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(status_code=413, detail="File too large (max 5MB)")

    # The filename is optional on an upload; match extensions case-insensitively
    # so "INVOICE.PDF" is accepted like "invoice.pdf".
    filename = (file.filename or "").lower()

    if filename.endswith(".pdf"):
        try:
            import pymupdf as fitz
        except ImportError:
            import fitz
        try:
            # Context manager closes the document even if text extraction fails.
            with fitz.open(stream=content, filetype="pdf") as doc:
                text = "\n\n".join(page.get_text() for page in doc)
        except (RuntimeError, ValueError) as e:
            # PyMuPDF reports damaged/empty documents via RuntimeError subclasses
            # (ValueError is included defensively for older releases).
            raise HTTPException(status_code=422, detail=f"Could not read PDF: {e}") from e
    elif filename.endswith((".txt", ".md")):
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError as e:
            raise HTTPException(
                status_code=400, detail="Text files must be UTF-8 encoded."
            ) from e
    else:
        raise HTTPException(status_code=400, detail="Supported file types: .pdf, .txt, .md")

    if len(text.strip()) < 20:
        raise HTTPException(status_code=422, detail="Document appears empty or image-only.")

    try:
        result = await extract(text, schema_name)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    return result
@app.get("/schemas")
async def list_schemas():
    # Report the names of every registered extraction schema.
    # The registry is imported inside the handler, at call time.
    from schemas import SCHEMAS

    schema_names = list(SCHEMAS)  # iterating a dict yields its keys, in order
    return {"available_schemas": schema_names}
@app.get("/health")
async def health():
    # Liveness probe: always answers with a fixed payload and performs no checks.
    payload = {"status": "ok"}
    return payload
Test it
uvicorn app:app --reload
# Extract from text
curl -X POST http://localhost:8000/extract/text \
-H "Content-Type: application/json" \
-d '{"text": "Invoice #INV-001 from Acme Corp, total $1,250 USD, due Jan 15 2025", "schema_name": "invoice"}'
# Extract from file
curl -X POST http://localhost:8000/extract/file \
-F "file=@test_data/invoices/sample.pdf" \
-F "schema_name=invoice"
Expected output:
{
"vendor_name": "Acme Corp",
"invoice_number": "INV-001",
"invoice_date": null,
"due_date": "2025-01-15",
"total_amount": 1250.0,
"currency": "USD",
"line_items": [],
"confidence": 0.7
}