ChromaDB¶

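ChromaDB is the fastest way to get a vector database running. ChromaDB itself requires no server, no API keys, and no configuration — ideal for local development, prototyping, and small-to-medium production deployments. (An API key is only needed if you pair it with a hosted embedding function, such as the OpenAI one used in the examples below.)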

Learning objectives¶

Use both in-memory and persistent ChromaDB clients
Create collections with custom embedding functions
Perform filtered and unfiltered queries
Manage collections (add, update, delete, list)

Client modes¶

import chromadb
from chromadb.utils import embedding_functions

# The options below are alternatives. Both active assignments bind the same
# name, so after this snippet runs `client` refers to the persistent client.

# 1. In-memory (data lost on process exit)
client = chromadb.Client()

# 2. Persistent (data saved to disk)
client = chromadb.PersistentClient(path="./my_chroma_db")

# 3. HTTP client (connect to a running ChromaDB server)
# client = chromadb.HttpClient(host="localhost", port=8000)

Creating collections¶

import os

# Built-in OpenAI embedding function
# The API key is read from the OPENAI_API_KEY environment variable.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)

# Built-in Sentence Transformers (no API key needed)
# Shown as an alternative; the collections below are created with openai_ef.
st_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create collection with distance metric
collection = client.create_collection(
    name="my_documents",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"}   # "cosine", "l2", or "ip"
)

# Get existing or create new
# Uses the same name and settings as the create_collection call above.
collection = client.get_or_create_collection(
    name="my_documents",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"}
)

# Report the collection's name and how many items it currently holds.
print(f"Collection: {collection.name}, Count: {collection.count()}")

Adding documents¶

# Simple text add — ChromaDB calls the embedding function automatically
# ids, documents and metadatas are parallel lists: entry i of each list
# describes the same document.
collection.add(
    ids=["doc-1", "doc-2", "doc-3"],
    documents=[
        "Python is a versatile programming language",
        "FastAPI is a modern web framework for Python",
        "The Eiffel Tower is 330 meters tall"
    ],
    metadatas=[
        {"source": "python-wiki", "category": "programming", "year": 2024},
        {"source": "fastapi-docs", "category": "programming", "year": 2024},
        {"source": "eiffel-wiki", "category": "tourism", "year": 2023}
    ]
)

# Add with pre-computed embeddings (skip the embedding function)
import numpy as np
# Random placeholder vectors: 2 rows, one per document added below.
# NOTE(review): 1536 presumably matches the output size of the collection's
# embedding model — confirm before swapping in a different model.
my_embeddings = np.random.rand(2, 1536).tolist()  # your pre-computed embeddings

collection.add(
    ids=["custom-1", "custom-2"],
    embeddings=my_embeddings,
    documents=["document text 1", "document text 2"],
    metadatas=[{"source": "custom"}, {"source": "custom"}]
)

# Count now includes both the text-added and the pre-embedded documents.
print(f"Total documents: {collection.count()}")

Querying¶

# Basic semantic search
results = collection.query(
    query_texts=["web development in Python"],
    n_results=2,
    include=["documents", "metadatas", "distances"]
)

# Index [0] picks the hits for the first (and only) query text; the three
# lists are walked in lockstep so each iteration sees one hit.
for doc, meta, dist in zip(
    results["documents"][0],
    results["metadatas"][0],
    results["distances"][0]
):
    similarity = 1 - dist  # for cosine space: distance = 1 - similarity
    # Show the similarity, the first 60 characters of the text, and the source.
    print(f"[{similarity:.3f}] {doc[:60]}... | source: {meta['source']}")

# Filtered query — only return programming documents
filtered_results = collection.query(
    query_texts=["Python frameworks"],
    n_results=3,
    where={"category": {"$eq": "programming"}},  # metadata filter
    include=["documents", "metadatas", "distances"]
)

# Filter by year range
# Only a lower bound is applied here: year >= 2024.
recent_results = collection.query(
    query_texts=["programming"],
    n_results=5,
    where={"year": {"$gte": 2024}},
    include=["documents", "metadatas"]
)

# Semantic query combined with a keyword filter on the document text:
# hits are ranked against "tower" but must also contain the substring "tall".
keyword_results = collection.query(
    query_texts=["tower"],
    n_results=3,
    where_document={"$contains": "tall"},  # document must contain "tall"
    include=["documents", "metadatas"]
)

ChromaDB filter operators¶

Operator	Meaning	Example
`$eq`	Equal	`{"category": {"$eq": "tech"}}`
`$ne`	Not equal	`{"status": {"$ne": "deleted"}}`
`$gt`, `$gte`	Greater than (or equal)	`{"year": {"$gte": 2023}}`
`$lt`, `$lte`	Less than (or equal)	`{"score": {"$lt": 0.5}}`
`$in`	Value in list	`{"tag": {"$in": ["ml", "ai"]}}`
`$nin`	Value not in list	`{"tag": {"$nin": ["spam"]}}`
`$and`	Logical AND	`{"$and": [{"a": {"$eq": 1}}, {"b": {"$eq": 2}}]}`
`$or`	Logical OR	`{"$or": [{"cat": "ml"}, {"cat": "ai"}]}`

CRUD operations¶

# Get specific documents by ID
# This is a direct lookup by ID: no query text is involved.
get_results = collection.get(
    ids=["doc-1", "doc-2"],
    include=["documents", "metadatas"]
)
print(get_results)

# Update document text and/or metadata
collection.update(
    ids=["doc-1"],
    documents=["Python is a high-level, versatile programming language created by Guido van Rossum"],
    metadatas=[{"source": "python-wiki", "category": "programming", "year": 2025}]
)

# Upsert (insert if new, update if exists)
# "doc-1" is an ID used earlier on this page; "doc-new" has not appeared before.
collection.upsert(
    ids=["doc-1", "doc-new"],
    documents=["Updated Python doc", "Brand new document"],
    metadatas=[{"source": "wiki", "year": 2025}, {"source": "new", "year": 2025}]
)

# Delete by ID
collection.delete(ids=["custom-1", "custom-2"])

# Delete by metadata filter
# Uses the same filter syntax as the `where` argument of query().
collection.delete(where={"category": {"$eq": "tourism"}})

print(f"After deletes: {collection.count()}")

# List all collections
all_collections = client.list_collections()
print(f"Collections: {[c.name for c in all_collections]}")

# Delete a collection
# Removes the whole "my_documents" collection, not just individual documents.
client.delete_collection("my_documents")

Building a document store with ChromaDB¶

import os
from openai import OpenAI
import chromadb
from chromadb.utils import embedding_functions

class ChromaDocStore:
    def __init__(self, collection_name: str, persist_path: str = "./chroma"):
        self.client = chromadb.PersistentClient(path=persist_path)
        self.ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name="text-embedding-3-small"
        )
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.ef,
            metadata={"hnsw:space": "cosine"}
        )
        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def add(self, docs: list[dict]) -> int:
        """Add documents. Each doc must have: id, text, and optional metadata."""
        self.collection.upsert(
            ids=[d["id"] for d in docs],
            documents=[d["text"] for d in docs],
            metadatas=[{k: v for k, v in d.items() if k not in {"id", "text"}} for d in docs]
        )
        return len(docs)

    def search(
        self,
        query: str,
        k: int = 5,
        filters: dict | None = None,
        min_similarity: float = 0.5
    ) -> list[dict]:
        kwargs = {"query_texts": [query], "n_results": k,
                  "include": ["documents", "metadatas", "distances"]}
        if filters:
            kwargs["where"] = filters

        results = self.collection.query(**kwargs)
        output = []
        for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
            sim = 1 - dist
            if sim >= min_similarity:
                output.append({"text": doc, "similarity": sim, **meta})
        return output

    def answer(self, question: str, k: int = 4, filters: dict | None = None) -> str:
        retrieved = self.search(question, k=k, filters=filters)
        if not retrieved:
            return "I don't have relevant information to answer this question."

        context = "\n\n".join(f"[{r.get('source', 'unknown')}]: {r['text']}" for r in retrieved)
        response = self.openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer based only on the context. Cite sources. Say 'I don't know' if not in context."},
                {"role": "user", "content": f"Context:\n{context}\n\nQ: {question}"}
            ],
            max_tokens=400, temperature=0.0
        )
        return response.choices[0].message.content

# Usage: seed the store with three policy snippets, then ask two questions.
seed_documents = [
    {"id": "p1", "text": "Remote work is permitted 3 days/week.", "source": "policy.pdf", "dept": "hr"},
    {"id": "p2", "text": "PTO accrues at 1.67 days/month.", "source": "policy.pdf", "dept": "hr"},
    {"id": "e1", "text": "Engineering uses Jira for project tracking.", "source": "eng-guide.pdf", "dept": "engineering"},
]
store = ChromaDocStore("company_knowledge")
store.add(seed_documents)

# Unfiltered question: any document may be used as context.
remote_answer = store.answer("How many remote days can I work?")
print(remote_answer)
print()

# Filtered question: only documents tagged dept == "engineering" are searched.
engineering_only = {"dept": {"$eq": "engineering"}}
print(store.answer("What tools does engineering use?", filters=engineering_only))

Common mistakes¶

ChromaDB distance is NOT similarity

ChromaDB returns distances, not similarities. For cosine space: similarity = 1 - distance. A distance of 0 = identical, distance of 2 = opposite. Don't use the raw distance value as a quality threshold — convert to similarity first.

In-memory client loses data on restart

chromadb.Client() stores data in memory only. Always use chromadb.PersistentClient(path="...") for anything you need to keep between runs.

n_results cannot exceed the collection size

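If you request n_results=5 but only have 3 documents, older ChromaDB versions raise an error (newer releases log a warning and return fewer results). To behave the same everywhere, guard with min(k, collection.count()) and skip the query entirely when the collection is empty.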

01-vector-db-overview | 03-pinecone