rag_context.py

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 3

Funktionen 3

Code

"""
RAG Context Functions - Load context from Qdrant and semantic data.
"""

import sys

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db
from embed import search_similar


def get_rag_context(briefing: str, collection: str = "documents", limit: int = 5) -> list[dict]:
    """
    Retrieve the chunks most similar to a briefing text.

    The lookup itself is delegated to ``search_similar``; each hit it returns
    is flattened into a plain dict so callers do not need to know the shape
    of the raw search result.

    Args:
        briefing: Text used as the similarity query.
        collection: Name of the collection to search in.
        limit: Maximum number of hits requested from the search.

    Returns:
        One dict per hit, in the order delivered by ``search_similar``, with
        the keys 'content', 'source', 'score', 'chunk_id' and 'document_id'.
    """

    def _flatten(hit: dict) -> dict:
        # Missing payload fields fall back to "" / "Unknown" / None.
        payload = hit["payload"]
        return {
            "content": payload.get("content", ""),
            "source": payload.get("document_title", "Unknown"),
            "score": round(hit["score"], 4),
            "chunk_id": payload.get("chunk_id"),
            "document_id": payload.get("document_id"),
        }

    hits = search_similar(briefing, collection=collection, limit=limit)
    return [_flatten(hit) for hit in hits]


def get_semantic_context(chunk_ids: list[int]) -> dict:
    """
    Load entities and relations based on chunk_ids.

    Entities are looked up through the chunk_entities junction table and
    ranked by their average relevance score over the given chunks (at most
    10 are kept). Relations are then loaded only where both the source and
    the target entity are among those entities (at most 15 are kept).

    Each cursor is closed in a ``finally`` clause, so it is released even
    when fetching the rows raises.

    Args:
        chunk_ids: List of chunk IDs from RAG context. ``None`` entries are
            ignored.

    Returns:
        dict with 'entities' and 'relations' lists. Both lists are empty
        when no usable chunk ID was given or no entity matched.
    """

    def _fetch_all(sql: str, params: tuple):
        # Run one query and always release its cursor, even on failure.
        cursor = db.execute(sql, params)
        try:
            return cursor.fetchall()
        finally:
            cursor.close()

    # Drop None values; an empty or all-None input needs no database access.
    valid_ids = [cid for cid in (chunk_ids or []) if cid is not None]
    if not valid_ids:
        return {"entities": [], "relations": []}

    placeholders = ", ".join(["%s"] * len(valid_ids))

    # Load entities via chunk_entities
    entities = _fetch_all(
        f"""SELECT DISTINCT e.id, e.name, e.type, e.description,
              AVG(ce.relevance_score) as relevance
           FROM chunk_entities ce
           JOIN entities e ON ce.entity_id = e.id
           WHERE ce.chunk_id IN ({placeholders})
           GROUP BY e.id, e.name, e.type, e.description
           ORDER BY relevance DESC
           LIMIT 10""",
        tuple(valid_ids),
    )

    if not entities:
        return {"entities": [], "relations": []}

    # Get entity IDs for relation lookup
    entity_ids = [e["id"] for e in entities]
    entity_placeholders = ", ".join(["%s"] * len(entity_ids))

    # Load relations between found entities. The ID list is bound twice
    # because the placeholder group appears in both IN (...) clauses.
    relations = _fetch_all(
        f"""SELECT e1.name as source, er.relation_type, e2.name as target
           FROM entity_relations er
           JOIN entities e1 ON er.source_entity_id = e1.id
           JOIN entities e2 ON er.target_entity_id = e2.id
           WHERE e1.id IN ({entity_placeholders}) AND e2.id IN ({entity_placeholders})
           LIMIT 15""",
        tuple(entity_ids) + tuple(entity_ids),
    )

    return {"entities": entities, "relations": relations}


def get_taxonomy_context(document_ids: list[int]) -> list[dict]:
    """
    Load taxonomy terms for documents.

    Terms are collected through the document_taxonomy table for the given
    documents. Each term carries the highest confidence found among those
    documents, and the result is ordered by that confidence, highest first.

    The cursor is closed in a ``finally`` clause, so it is released even
    when fetching the rows raises.

    Args:
        document_ids: List of document IDs from RAG context. ``None``
            entries are ignored.

    Returns:
        List of taxonomy term dicts with name, slug, confidence. Empty when
        no usable document ID was given.
    """
    # Drop None values; an empty or all-None input needs no database access.
    valid_ids = [did for did in (document_ids or []) if did is not None]
    if not valid_ids:
        return []

    placeholders = ", ".join(["%s"] * len(valid_ids))

    cursor = db.execute(
        f"""SELECT DISTINCT tt.name, tt.slug, MAX(dt.confidence) as confidence
           FROM document_taxonomy dt
           JOIN taxonomy_terms tt ON dt.taxonomy_term_id = tt.id
           WHERE dt.document_id IN ({placeholders})
           GROUP BY tt.id, tt.name, tt.slug
           ORDER BY confidence DESC""",
        tuple(valid_ids),
    )
    try:
        return cursor.fetchall()
    finally:
        # Release the cursor whether or not fetchall() succeeded.
        cursor.close()
← Übersicht