Search that finds the doc titled "outage at 3am" when the user typed "server fell over", with the index sitting in your own Postgres instead of a managed store you can't inspect. Ringside does the embedding and pgvector does the ranking. Every call carries an FC-Customer, so the cost lands against that tenant in /app/margin.

If you'd rather not own the index at all, file_search and Vector Stores do this managed. This recipe is the bring-your-own version.

What you need

An FC API key with scope api:chat
Postgres 15+ with the pgvector extension installed
pip install openai "psycopg[binary]"

Full code

python
# rag_index.py
import os, psycopg
from openai import OpenAI

# OpenAI SDK client configured with the FC API key and the FC base URL.
# os.environ[...] raises KeyError as soon as the module loads if FC_API_KEY is unset.
client = OpenAI(api_key=os.environ["FC_API_KEY"], base_url="https://api.fightclub.pro/v1")
# Postgres connection string; the functions in this file pass it to psycopg.connect().
# Also raises KeyError at load time if DATABASE_URL is unset.
DB = os.environ["DATABASE_URL"]
# Value sent as the FC-Customer header on both the embedding and the chat calls.
CUSTOMER = "cus_shared_rag"

# DDL executed by init_schema(). Every statement uses IF NOT EXISTS, so running
# it again against an already-initialised database changes nothing.
# vector(1536) fixes the stored dimension: inserted embeddings must have exactly
# that length (presumably what text-embedding-3-small returns by default —
# confirm before switching models).
# The HNSW index is built with vector_cosine_ops so it matches the <=>
# cosine-distance operator that search() orders by.
SCHEMA_SQL = """
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS docs (
  id bigserial PRIMARY KEY,
  title text NOT NULL,
  body text NOT NULL,
  embedding vector(1536) NOT NULL
);
CREATE INDEX IF NOT EXISTS docs_embedding_idx ON docs USING hnsw (embedding vector_cosine_ops);
"""


def init_schema():
    """Create the pgvector extension, the docs table and its HNSW index.

    Opens a fresh connection to DB and runs SCHEMA_SQL in a single execute
    call. The connection context manager commits when the block exits
    without an error.
    """
    with psycopg.connect(DB) as connection:
        with connection.cursor() as cursor:
            cursor.execute(SCHEMA_SQL)


def embed(texts: list[str], batch_size: int = 2048) -> list[list[float]]:
    """Return one embedding vector per input string, in input order.

    The embeddings endpoint accepts at most 2048 strings per request (the
    input-array cap) and 100k chars per string. Longer lists are split into
    consecutive requests of at most ``batch_size`` strings, so callers can
    pass a corpus of any size. Per-string length is not checked here.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without making
            any API call.
        batch_size: Maximum number of strings sent per request. Keep it at or
            below the 2048 cap.

    Returns:
        A list of embedding vectors, one per element of ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        r = client.embeddings.create(
            model="fc:openai/text-embedding-3-small",
            input=texts[start:start + batch_size],
            # FC-Tag keeps retrieval-embedding cost separate from answer cost.
            extra_headers={"FC-Customer": CUSTOMER, "FC-Tag": "rag.embed"},
        )
        # Batches are requested in order, so extending keeps vectors aligned
        # with the positions of their source strings.
        vectors.extend(d.embedding for d in r.data)
    return vectors


def index_corpus(rows: list[tuple[str, str]]):
    """Embed each document body and insert the documents into the docs table.

    Only the body is embedded; the title is stored alongside it but is not
    part of the vector. Every call appends new rows, so indexing the same
    corpus twice stores it twice.

    Args:
        rows: ``[(title, body), ...]``. An empty list is a no-op: no
            embedding request is sent and no database connection is opened.

    Raises:
        RuntimeError: If the number of embeddings returned does not match the
            number of rows. Nothing is inserted in that case.
    """
    if not rows:
        return

    vectors = embed([body for _, body in rows])
    # zip() would silently drop trailing documents on a short response, so
    # fail loudly instead of indexing a partial corpus.
    if len(vectors) != len(rows):
        raise RuntimeError(
            f"expected {len(rows)} embeddings, got {len(vectors)}"
        )

    params = [(title, body, vec) for (title, body), vec in zip(rows, vectors)]
    with psycopg.connect(DB) as conn, conn.cursor() as cur:
        # One batched call instead of one execute() per row.
        cur.executemany(
            "INSERT INTO docs (title, body, embedding) VALUES (%s, %s, %s)",
            params,
        )


def search(question: str, k: int = 5) -> list[dict]:
    """Return the k stored documents closest to the question.

    The question is embedded with embed(), then docs is ordered by the
    pgvector cosine-distance operator (<=>) against that vector.

    Args:
        question: Free-text query to embed and search with.
        k: Maximum number of rows to return (passed to LIMIT).

    Returns:
        A list of dicts with keys "id", "title", "body" and "score", nearest
        first. "score" is 1 minus the cosine distance, so higher is closer.
    """
    query_vector = embed([question])[0]
    columns = ("id", "title", "body", "score")
    with psycopg.connect(DB) as conn, conn.cursor() as cur:
        # The vector is bound twice: once for the score, once for the ordering.
        cur.execute(
            """
            SELECT id, title, body, 1 - (embedding <=> %s::vector) AS score
            FROM docs
            ORDER BY embedding <=> %s::vector
            LIMIT %s
            """,
            (query_vector, query_vector, k),
        )
        matches = cur.fetchall()
    return [dict(zip(columns, row)) for row in matches]


def answer(question: str) -> str:
    """Answer a question from the three best-matching indexed documents.

    Runs search() with k=3, renders each hit as a "## title" section followed
    by its body, and sends that context plus the question to the chat model.
    The system prompt tells the model to answer only from the context and to
    cite titles.

    Args:
        question: The user's question.

    Returns:
        The message content of the first completion choice.
    """
    sections = []
    for hit in search(question, k=3):
        sections.append(f"## {hit['title']}\n{hit['body']}")
    context = "\n\n".join(sections)

    system_message = {
        "role": "system",
        "content": "Answer only from the context. Cite titles.",
    }
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {question}",
    }
    completion = client.chat.completions.create(
        model="fc:openai/gpt-4o-mini",
        messages=[system_message, user_message],
        # Tagged separately from "rag.embed" so the two costs can be told apart.
        extra_headers={"FC-Customer": CUSTOMER, "FC-Tag": "rag.answer"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


if __name__ == "__main__":
    # Demo run: create the schema, index three sample docs, ask one question.
    # NOTE: index_corpus does plain INSERTs and docs has no uniqueness
    # constraint, so each run of this script adds the three rows again.
    sample_docs = [
        ("Client Tokens", "Client Tokens are short-lived JWTs scoped to one Customer..."),
        ("Budgets", "Every Customer has a monthly budget cap. Once hit, 402 customer_budget_exceeded."),
        ("Webhooks", "Ringside fires webhooks on 34 event types with HMAC-signed payloads."),
    ]
    init_schema()
    index_corpus(sample_docs)
    reply = answer("How do I limit per-user spend?")
    print(reply)

Walkthrough

The operator <=> is pgvector's cosine-distance operator (0 = identical, 2 = opposite). Because smaller is closer, ORDER BY embedding <=> q_vec LIMIT k gives you the top K. The 1 - distance expression inverts it into a cosine-similarity score (1 = identical, -1 = opposite) so higher is better, which is nicer to show in a UI.

hnsw is the recommended pgvector index for small-to-medium corpora (up to ~10M rows). For larger corpora, consider IVFFlat with lists = sqrt(n_rows).

FC-Tag on both the embed and the chat calls lets you separate "retrieval embedding cost" from "answer completion cost" in /v1/usage?group_by=tag. This is the easiest way to decide whether your RAG tuning is helping.

Run it

bash
export FC_API_KEY=ko_0d7f2a91c4e35b86af10d2c7e94b6f3a5d81c02e7b4936af18d5c60e2a7f9b34
export DATABASE_URL=postgres://localhost/ragdemo
python rag_index.py

What's next

file_search with citations, the managed alternative to the index above
Overnight batch classifications
POST /v1/embeddings

Embeddings + pgvector search

What you need

Full code

Walkthrough

Run it

What's next