Cross-modal RAG: text<->image

Because epithre-embed places text and images in the same 4000-dim vector space, you can build search experiences that other providers need two separate models for. Use one index, one similarity function, one pipeline.

Pattern A: text query -> image hits

The user types a query such as "kucing oren tidur di kasur" (Indonesian for "orange cat sleeping on a bed") and you return the matching photos from your catalog.

import base64, httpx, numpy as np
from openai import OpenAI

# OpenAI SDK client pointed at the Epithre endpoint. EK is passed as the API
# key; it is not defined in this snippet and must be set beforehand.
client = OpenAI(
    base_url="https://api.epithre.com/v1",
    api_key=EK,
)

# One-time: embed your image catalog
def embed_image_file(path):
    """Embed one image file with the ``epithre-embed`` model.

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        The image embedding as a ``float32`` NumPy array.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of lingering until garbage collection; this matters when the
    # function is called once per file over a large catalog.
    with open(path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    # Images are sent as base64 text inside a typed input item.
    r = client.embeddings.create(
        model="epithre-embed",
        input=[{"type": "image", "image": b64}],
    )
    return np.array(r.data[0].embedding, dtype=np.float32)

import glob
# Maps each JPEG path under photos/ to its embedding; embed_image_file is
# called once per file, so this is meant to run a single time up front.
catalog = {
    photo: embed_image_file(photo)
    for photo in glob.iglob("photos/*.jpg")
}

# Query time: embed text, cosine-rank images
def search(query: str, top_k: int = 5):
    """Rank the catalog images against a text query.

    Embeds ``query`` with ``epithre-embed`` and scores every entry of the
    module-level ``catalog`` by its dot product with the query vector. The
    dot product equals cosine similarity only for unit-length vectors;
    presumably the API returns L2-normalized embeddings (TODO confirm).

    Args:
        query: Free-text search query.
        top_k: Number of leading results to keep.

    Returns:
        A list of ``(path, score)`` tuples, highest score first.
    """
    response = client.embeddings.create(model="epithre-embed", input=[query])
    query_vec = np.array(response.data[0].embedding, dtype=np.float32)

    ranked = []
    for image_path, image_vec in catalog.items():
        ranked.append((image_path, float(query_vec @ image_vec)))

    # Sorting on the negated score yields descending order while keeping
    # equal-scored entries in catalog order.
    ranked.sort(key=lambda hit: -hit[1])
    return ranked[:top_k]

# Demo: run one text query and print each hit as "<score>  <path>", with the
# score formatted to three decimal places.
for path, score in search("kucing oren tidur di kasur"):
    print(f"{score:.3f}  {path}")

Pattern B: image query -> text hits

Upload a product photo, find matching descriptions in your knowledge base.

# Knowledge-base passages that image queries are matched against.
descriptions = [
    "Kopi arabika single-origin dari Gayo, light roast, notes citrus dan bunga",
    "Teh hijau premium Jawa Barat, panen pagi, aroma rumput segar",
    "Cokelat dark 70% Sulawesi, single-estate, finish bitter-fruity",
]
# Embed all descriptions in one batched request. float32 matches the dtype of
# the image query vector built in reverse_image_search, so scoring does not
# mix precisions. The later zip(descriptions, text_vecs) relies on the
# response items coming back in input order (presumably guaranteed by the
# API; confirm).
text_vecs = [
    np.array(item.embedding, dtype=np.float32)
    for item in client.embeddings.create(
        model="epithre-embed", input=descriptions
    ).data
]

def reverse_image_search(img_path: str, top_k: int = 3):
    """Rank the text descriptions against an image query.

    Embeds the image at ``img_path`` with ``epithre-embed`` and scores each
    entry of the module-level ``descriptions`` by the dot product between the
    image vector and the matching entry of ``text_vecs``. The dot product
    equals cosine similarity only for unit-length vectors; presumably the
    API returns L2-normalized embeddings (TODO confirm).

    Args:
        img_path: Filesystem path of the query image.
        top_k: Number of leading results to keep.

    Returns:
        A list of ``(description, score)`` tuples, highest score first.
    """
    # Context manager closes the file handle as soon as the bytes are read,
    # rather than leaving it open until garbage collection.
    with open(img_path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    r = client.embeddings.create(
        model="epithre-embed",
        input=[{"type": "image", "image": b64}],
    )
    qvec = np.array(r.data[0].embedding, dtype=np.float32)
    scored = [(d, float(qvec @ tv)) for d, tv in zip(descriptions, text_vecs)]
    # Negated score as sort key: descending order, ties keep input order.
    return sorted(scored, key=lambda x: -x[1])[:top_k]

# Demo: query with one product photo and print each hit as
# "<score>  <description>", with the score formatted to three decimal places.
for desc, score in reverse_image_search("coffee_bag_photo.jpg"):
    print(f"{score:.3f}  {desc}")

Pattern C: unified hybrid index

Store text passages and images in one pgvector table. Queries from either modality hit both.

-- One table holds both modalities: each row is a text passage or an image,
-- distinguished by `kind`, with its embedding stored alongside.
CREATE TABLE assets (
    id BIGSERIAL PRIMARY KEY,
    kind TEXT NOT NULL,           -- 'text' or 'image'
    payload TEXT,                 -- text content, or image storage path
    -- pgvector half-precision vector type; per the pgvector docs, halfvec
    -- columns can be HNSW-indexed up to 4,000 dimensions (vector: 2,000).
    embedding halfvec(4000)
);

-- HNSW approximate-nearest-neighbor index using the cosine-distance opclass.
CREATE INDEX ON assets USING hnsw (embedding halfvec_cosine_ops);

-- Query: nearest neighbors across BOTH modalities
-- $1 is the query embedding. <=> is pgvector's cosine-distance operator, so
-- 1 - distance is reported as the similarity `sim`; rows are returned
-- closest-first, capped at 20.
SELECT kind, payload, 1 - (embedding <=> $1::halfvec) AS sim
FROM assets ORDER BY embedding <=> $1::halfvec LIMIT 20;

Use cases that benefit from a unified index:

Use case: support ticket triage

Consider a worker that handles support tickets combining a text complaint with photos of the damaged product:

import numpy as np

# Incoming ticket: the complaint text below plus a photo of a dented box.
# img_b64 is presumably the base64-encoded photo; it is not defined in this
# snippet and must be prepared beforehand.
complaint = "barang nyampe rusak, sebelahnya bocor"

# Step 1: embed the text and the photo as two separate requests.
text_response = client.embeddings.create(
    model="epithre-embed",
    input=[complaint],
)
text_vec = text_response.data[0].embedding

image_response = client.embeddings.create(
    model="epithre-embed",
    input=[{"type": "image", "image": img_b64}],
)
img_vec = image_response.data[0].embedding

# Step 2: fuse both signals into one query vector by taking the element-wise
# mean and rescaling to unit L2 norm. Alternatives: query with either vector
# alone, or run both as separate queries.
combined = (np.array(text_vec) + np.array(img_vec)) / 2
combined /= np.linalg.norm(combined)

# Step 3: look up similar past resolved tickets in the combined text + photo
# index. pgvector_search is not defined in this snippet; the matches are
# expected to include past similar complaints with their resolutions.
matches = pgvector_search(combined, top_k=5)

# Step 4: hand the retrieved cases to the chat model for a triage suggestion.
# NOTE(review): the prompt hard-codes a shortened complaint ('barang rusak')
# rather than the ticket text above; confirm this is intended.
triage_prompt = f"Kasus baru: 'barang rusak'.\nKasus serupa sebelumnya:\n{matches}\n\nUsul resolusi?"
resp = client.chat.completions.create(
    model="epithre-omni",
    messages=[{"role": "user", "content": triage_prompt}],
)

Note on absolute scores

Cosine similarity between text and image vectors tends to be lower in absolute value than text-to-text similarity. A "very relevant" cross-modal match might score 0.15-0.25, whereas a text-to-text match of similar relevance might score 0.50-0.70. Always rely on rank order, not an absolute threshold.

See also