Advanced Features — Function-Calling Data Extractor
Retry on low confidence
When the extraction returns a low confidence score, retry with a more detailed prompt:
async def extract_with_retry(text: str, schema_name: str, min_confidence: float = 0.6) -> dict:
    """Run ``extract`` and retry once with a more explicit prompt if confidence is low.

    Args:
        text: Document text to extract from. Only the first 8000 characters
            are sent on the retry request.
        schema_name: Key into ``SCHEMAS`` selecting the schema class; also
            used to name the forced tool call (``extract_<schema_name>``).
        min_confidence: A first-pass result whose ``"confidence"`` is below
            this threshold triggers the retry.

    Returns:
        The first-pass result when its confidence is high enough, or when the
        retry response carries no usable tool call (missing tool calls,
        arguments that are not valid JSON, or JSON that is not an object).
        Otherwise the retry payload: the validated ``model_dump()`` of the
        schema, or the raw decoded arguments if schema validation fails.
    """
    result = await extract(text, schema_name)

    # A missing or null confidence counts as 0, so it always triggers a retry
    # instead of raising TypeError on the comparison.
    confidence = result.get("confidence") or 0
    if not confidence < min_confidence:
        return result

    # Imported lazily so they are only loaded on the retry path.
    import json

    from pydantic import ValidationError

    from extractor import SCHEMAS, aclient, build_tool

    tool_name = f"extract_{schema_name}"
    schema_class = SCHEMAS[schema_name]
    tool = build_tool(schema_class, tool_name)

    # Retry with a more explicit prompt
    response = await aclient.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "Extract EVERY piece of structured information you can find. "
                    "Look carefully for amounts, dates, names, IDs, and categories. "
                    "If a value is ambiguous, make your best inference and set confidence accordingly."
                ),
            },
            {"role": "user", "content": text[:8000]},
        ],
        tools=[tool],
        tool_choice={"type": "function", "function": {"name": tool_name}},
        temperature=0.0,
    )

    # The message may come back without tool calls; keep the first-pass
    # result rather than crashing on ``None[0]``.
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        return result

    try:
        raw = json.loads(tool_calls[0].function.arguments)
    except json.JSONDecodeError:
        # Malformed or truncated arguments: the retry produced nothing usable.
        return result
    if not isinstance(raw, dict):
        # ``schema_class(**raw)`` requires a JSON object.
        return result

    try:
        return schema_class(**raw).model_dump()
    except ValidationError:
        # Best effort: hand back the unvalidated payload, as before.
        return raw
Adding a new schema
To add a job description extractor, add to schemas.py:
# Schema for extracting structured fields from a job posting.
# Every scalar field is optional and defaults to None, and required_skills
# defaults to an empty list, so an instance can be built from a partial payload.
class JobDescription(BaseModel):
    job_title: Optional[str] = Field(None, description="Exact job title")
    company_name: Optional[str] = Field(None, description="Hiring company name")
    location: Optional[str] = Field(None, description="Job location (city, state, remote)")
    employment_type: Optional[str] = Field(None, description="full-time, part-time, contract, freelance")
    # Salary bounds are separate floats; nothing here enforces salary_min <= salary_max.
    salary_min: Optional[float] = Field(None, description="Minimum salary if range given")
    salary_max: Optional[float] = Field(None, description="Maximum salary if range given")
    required_skills: list[str] = Field(default_factory=list, description="Required technical skills")
    years_experience: Optional[int] = Field(None, description="Minimum years of experience required")
    # Constrained to the closed range [0.0, 1.0]; defaults to 0.0 when omitted.
    # NOTE(review): presumably the score the retry logic compares against
    # min_confidence -- confirm against the extractor.
    confidence: float = Field(0.0, ge=0.0, le=1.0)


# Register it under the "job_description" key so it can be looked up by
# schema name (extract_with_retry resolves schemas via SCHEMAS[schema_name]).
SCHEMAS["job_description"] = JobDescription
No other changes needed — the tool is built dynamically from the Pydantic schema.
Batch extraction
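For processing many documents, use the OpenAI Batch API or async parallel extraction. The example below takes the async approach, capping the number of in-flight requests with a semaphore: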
import asyncio
async def batch_extract(texts: list[str], schema_name: str, max_concurrent: int = 5) -> list[dict]:
    """Extract from many documents concurrently, with bounded parallelism.

    Args:
        texts: Documents to process.
        schema_name: Schema name forwarded to ``extract`` for every document.
        max_concurrent: Maximum number of ``extract`` calls in flight at once.
            Must be at least 1.

    Returns:
        One dict per input text, in the same order as ``texts``. A document
        whose extraction raised is represented by
        ``{"error": <message>, "confidence": 0.0}`` instead of aborting the
        whole batch.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    if max_concurrent < 1:
        # Semaphore(0) never admits a task, so the gather below would hang
        # forever; fail fast with a clear message instead.
        raise ValueError("max_concurrent must be at least 1")

    semaphore = asyncio.Semaphore(max_concurrent)

    async def extract_one(text: str) -> dict:
        async with semaphore:
            try:
                return await extract(text, schema_name)
            except Exception as e:
                # Deliberate best-effort boundary: one bad document must not
                # cancel the rest of the batch, so report it in-band.
                return {"error": str(e), "confidence": 0.0}

    return await asyncio.gather(*[extract_one(t) for t in texts])
Field normalization
Standardize inconsistent date and currency formats post-extraction:
from datetime import datetime
import re
def normalize_date(date_str: str | None) -> str | None:
if not date_str:
return None
for fmt in ["%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y", "%B %d, %Y", "%b %d, %Y"]:
try:
return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
except ValueError:
continue
return date_str # Return as-is if we can't parse
def normalize_amount(amount) -> float | None:
if amount is None:
return None
if isinstance(amount, (int, float)):
return float(amount)
# Handle strings like "$1,250.00" or "1.250,00"
cleaned = re.sub(r"[^\d.]", "", str(amount).replace(",", ""))
try:
return float(cleaned)
except ValueError:
return None