Vector Search¶

Brute-force cosine similarity works for thousands of documents. At millions, you need approximate nearest neighbor (ANN) search — trading a small amount of accuracy for 100–1000× faster retrieval. FAISS is the standard library for this.

Learning objectives¶

Implement brute-force similarity search from scratch
Build a FAISS index for approximate nearest neighbor search
Understand the IVF and HNSW index types
Add metadata filtering to vector search

Brute-force search (the baseline)¶

For small collections (< 100K docs), brute-force exact search is fine and requires no index.

import os
import numpy as np
from dataclasses import dataclass, field
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

@dataclass
class Document:
    id: str
    text: str
    metadata: dict = field(default_factory=dict)

class BruteForceVectorStore:
    def __init__(self):
        self.documents: list[Document] = []
        self.embeddings: np.ndarray | None = None

    def _embed(self, texts: list[str]) -> np.ndarray:
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=texts
        )
        return np.array([item.embedding for item in response.data])

    def add_documents(self, documents: list[Document]) -> None:
        texts = [doc.text for doc in documents]
        new_embeddings = self._embed(texts)

        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])

        self.documents.extend(documents)
        print(f"Index size: {len(self.documents)} documents")

    def search(self, query: str, k: int = 5) -> list[tuple[Document, float]]:
        if self.embeddings is None or len(self.documents) == 0:
            return []

        query_emb = self._embed([query])[0]

        # Normalize for cosine similarity via dot product
        query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-8)
        doc_norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        docs_normalized = self.embeddings / (doc_norms + 1e-8)

        scores = docs_normalized @ query_norm  # (N,)
        top_k_indices = np.argsort(scores)[::-1][:k]

        return [(self.documents[i], float(scores[i])) for i in top_k_indices]

# Example usage
store = BruteForceVectorStore()
store.add_documents([
    Document("1", "Python is a versatile programming language used for web, data science, and AI"),
    Document("2", "FastAPI is a modern, fast web framework for Python"),
    Document("3", "Pandas provides data structures and analysis tools for Python"),
    Document("4", "Neural networks learn by adjusting weights through backpropagation"),
    Document("5", "The Eiffel Tower stands 330 meters tall in Paris"),
    Document("6", "RAG combines retrieval with language model generation"),
])

results = store.search("what Python libraries exist for data analysis?", k=3)
for doc, score in results:
    print(f"  [{score:.3f}] {doc.text[:60]}...")

FAISS for scalable search¶

FAISS (Facebook AI Similarity Search) handles millions of vectors efficiently using approximate nearest neighbor algorithms.

import faiss
import numpy as np
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_embeddings_batch(texts: list[str], model: str = "text-embedding-3-small") -> np.ndarray:
    response = client.embeddings.create(model=model, input=texts)
    embeddings = np.array([item.embedding for item in response.data], dtype=np.float32)
    # Normalize for inner product search (equivalent to cosine similarity)
    faiss.normalize_L2(embeddings)
    return embeddings

# Sample documents
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with many layers",
    "Natural language processing handles text understanding",
    "Computer vision processes images and video",
    "Reinforcement learning trains agents through rewards",
    "Transfer learning reuses pretrained model weights",
    "BERT is a transformer model for text understanding",
    "GPT models generate text autoregressively",
]

# Build FAISS index
doc_embeddings = get_embeddings_batch(documents)
dimension = doc_embeddings.shape[1]  # 1536

# IndexFlatIP = Flat (brute-force) + Inner Product (cosine if normalized)
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)
print(f"Index contains {index.ntotal} vectors of dimension {dimension}")

# Search
def search_faiss(query: str, k: int = 3) -> list[tuple[str, float]]:
    query_emb = get_embeddings_batch([query])
    scores, indices = index.search(query_emb, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx != -1:  # -1 means no result
            results.append((documents[idx], float(score)))
    return results

results = search_faiss("How do transformers understand language?", k=3)
for text, score in results:
    print(f"  [{score:.3f}] {text}")

Index types¶

FAISS offers several index types with different speed/accuracy tradeoffs:

# 1. IndexFlatIP — exact search, no compression
# Best for: < 100K vectors, when accuracy is critical
flat_index = faiss.IndexFlatIP(dimension)

# 2. IndexIVFFlat — inverted file index, approximate
# Splits space into 'nlist' Voronoi cells, searches 'nprobe' cells
# Best for: 100K–10M vectors
nlist = 100  # number of clusters (typical: sqrt(N))
ivf_index = faiss.IndexIVFFlat(faiss.IndexFlatIP(dimension), dimension, nlist, faiss.METRIC_INNER_PRODUCT)
ivf_index.train(doc_embeddings)  # must train before adding
ivf_index.add(doc_embeddings)
ivf_index.nprobe = 10  # search 10 clusters (higher = more accurate, slower)

# 3. IndexHNSWFlat — Hierarchical Navigable Small World graph
# No training required, very fast at query time, more memory
# Best for: general purpose, latency-sensitive applications
hnsw_index = faiss.IndexHNSWFlat(dimension, 32)  # 32 = M (connections per node)
hnsw_index.hnsw.efConstruction = 40  # build quality (higher = better, slower to build)
hnsw_index.add(doc_embeddings)
hnsw_index.hnsw.efSearch = 16  # query quality (higher = more accurate)

def benchmark_indices(query: str) -> None:
    import time
    query_emb = get_embeddings_batch([query])

    for name, idx in [("Flat", flat_index), ("IVF", ivf_index), ("HNSW", hnsw_index)]:
        start = time.perf_counter()
        scores, indices = idx.search(query_emb, 3)
        elapsed = (time.perf_counter() - start) * 1000
        print(f"{name}: {elapsed:.2f}ms, top result: {documents[indices[0][0]][:40]}...")

Index	Build time	Query time	Accuracy	Memory
`IndexFlatIP`	Fast	O(N)	100%	4 bytes × D × N
`IndexIVFFlat`	Medium	O(√N)	95–99%	4 bytes × D × N + overhead
`IndexHNSWFlat`	Slow	O(log N)	98–99%	~2× flat memory

Start with IndexFlatIP

For a course project or prototype with < 50K documents, IndexFlatIP is exact and simple. Move to IVF or HNSW only when query latency or memory becomes a constraint.

Persisting and loading an index¶

# Save index to disk
faiss.write_index(index, "my_index.faiss")

# Load later
loaded_index = faiss.read_index("my_index.faiss")
print(f"Loaded index with {loaded_index.ntotal} vectors")

# Save document store separately (FAISS only stores vectors, not metadata)
import json
with open("my_docs.json", "w") as f:
    json.dump(documents, f)

with open("my_docs.json") as f:
    loaded_docs = json.load(f)

Metadata filtering¶

FAISS doesn't support metadata filters natively — you filter after retrieval.

from dataclasses import dataclass, field

@dataclass
class IndexedDocument:
    id: str
    text: str
    embedding: np.ndarray
    metadata: dict = field(default_factory=dict)

class FilterableVectorStore:
    def __init__(self, dimension: int):
        self.index = faiss.IndexFlatIP(dimension)
        self.documents: list[IndexedDocument] = []

    def add(self, doc: IndexedDocument) -> None:
        emb = doc.embedding.reshape(1, -1).astype(np.float32)
        faiss.normalize_L2(emb)
        self.index.add(emb)
        self.documents.append(doc)

    def search(
        self,
        query_embedding: np.ndarray,
        k: int = 5,
        metadata_filter: dict | None = None,
        overretrieve_factor: int = 3
    ) -> list[tuple[IndexedDocument, float]]:
        # Retrieve more results to account for filtering
        retrieve_k = k * overretrieve_factor if metadata_filter else k

        query_emb = query_embedding.reshape(1, -1).astype(np.float32)
        faiss.normalize_L2(query_emb)
        scores, indices = self.index.search(query_emb, min(retrieve_k, self.index.ntotal))

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:
                continue
            doc = self.documents[idx]
            if metadata_filter:
                if not all(doc.metadata.get(k) == v for k, v in metadata_filter.items()):
                    continue
            results.append((doc, float(score)))
            if len(results) == k:
                break

        return results

# Example: filter by category
store = FilterableVectorStore(dimension=1536)
# Add docs with metadata...
# results = store.search(query_emb, k=5, metadata_filter={"category": "science", "year": 2024})

Overretrieve when filtering

If you retrieve k=5 and filter aggressively, you may end up with 0–2 results. Retrieve k × overretrieve_factor candidates before filtering. A factor of 3–10 is typical depending on how selective the filter is.

What's next¶

Brute-force and FAISS give you the retrieval layer. The next step is combining retrieval with generation in a full semantic search pipeline.