rag_context.py
- Pfad:
/var/www/scripts/pipeline/generators/rag_context.py - Namespace: pipeline
- Zeilen: 129 | Größe: 3,897 Bytes
- Geändert: 2025-12-25 13:55:50 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 3
- use sys
- use db.db
- use embed.search_similar
Funktionen 3
-
get_rag_context()Zeile 13 -
get_semantic_context()Zeile 35 -
get_taxonomy_context()Zeile 96
Code
"""
RAG Context Functions - Load context from Qdrant and semantic data.
"""
import sys
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
from embed import search_similar
def get_rag_context(briefing: str, collection: str = "documents", limit: int = 5) -> list[dict]:
    """
    Retrieve the most relevant chunks from Qdrant for a briefing text.

    Args:
        briefing: Query text used for the similarity search.
        collection: Qdrant collection to search in.
        limit: Maximum number of chunks to return.

    Returns:
        List of dicts, one per hit, with content, source title,
        rounded similarity score, chunk_id and document_id.
    """
    hits = search_similar(briefing, collection=collection, limit=limit)
    return [
        {
            "content": hit["payload"].get("content", ""),
            "source": hit["payload"].get("document_title", "Unknown"),
            # Round for readability when the score is shown in prompts/logs.
            "score": round(hit["score"], 4),
            "chunk_id": hit["payload"].get("chunk_id"),
            "document_id": hit["payload"].get("document_id"),
        }
        for hit in hits
    ]
def get_semantic_context(chunk_ids: list[int]) -> dict:
    """
    Load entities and relations based on chunk_ids.

    Uses the chunk_entities junction table to find relevant entities,
    then loads relations between those entities.

    Args:
        chunk_ids: List of chunk IDs from RAG context (None entries are ignored)

    Returns:
        dict with 'entities' and 'relations' lists
    """
    if not chunk_ids:
        return {"entities": [], "relations": []}
    # Filter out None values (chunks without a stored chunk_id)
    chunk_ids = [cid for cid in chunk_ids if cid is not None]
    if not chunk_ids:
        return {"entities": [], "relations": []}
    # The f-string only injects "%s" placeholder markers; the actual IDs are
    # passed as bound parameters, so this is not an injection risk.
    placeholders = ", ".join(["%s"] * len(chunk_ids))
    # Load entities via chunk_entities, ranked by average relevance score.
    cursor = db.execute(
        f"""SELECT DISTINCT e.id, e.name, e.type, e.description,
            AVG(ce.relevance_score) as relevance
            FROM chunk_entities ce
            JOIN entities e ON ce.entity_id = e.id
            WHERE ce.chunk_id IN ({placeholders})
            GROUP BY e.id, e.name, e.type, e.description
            ORDER BY relevance DESC
            LIMIT 10""",
        tuple(chunk_ids),
    )
    try:
        entities = cursor.fetchall()
    finally:
        # Close even when fetchall raises, so the cursor is never leaked.
        cursor.close()
    if not entities:
        return {"entities": [], "relations": []}
    # Get entity IDs for relation lookup
    entity_ids = [e["id"] for e in entities]
    entity_placeholders = ", ".join(["%s"] * len(entity_ids))
    # Load relations between found entities; the ID list is bound twice
    # because it appears in both IN (...) clauses.
    cursor = db.execute(
        f"""SELECT e1.name as source, er.relation_type, e2.name as target
            FROM entity_relations er
            JOIN entities e1 ON er.source_entity_id = e1.id
            JOIN entities e2 ON er.target_entity_id = e2.id
            WHERE e1.id IN ({entity_placeholders}) AND e2.id IN ({entity_placeholders})
            LIMIT 15""",
        tuple(entity_ids) + tuple(entity_ids),
    )
    try:
        relations = cursor.fetchall()
    finally:
        cursor.close()
    return {"entities": entities, "relations": relations}
def get_taxonomy_context(document_ids: list[int]) -> list[dict]:
    """
    Load taxonomy terms for documents.

    Args:
        document_ids: List of document IDs from RAG context (None entries are ignored)

    Returns:
        List of taxonomy term dicts with name, slug, confidence
    """
    if not document_ids:
        return []
    # Filter out None values (results without a stored document_id)
    document_ids = [did for did in document_ids if did is not None]
    if not document_ids:
        return []
    # The f-string only injects "%s" placeholder markers; the actual IDs are
    # passed as bound parameters, so this is not an injection risk.
    placeholders = ", ".join(["%s"] * len(document_ids))
    cursor = db.execute(
        f"""SELECT DISTINCT tt.name, tt.slug, MAX(dt.confidence) as confidence
            FROM document_taxonomy dt
            JOIN taxonomy_terms tt ON dt.taxonomy_term_id = tt.id
            WHERE dt.document_id IN ({placeholders})
            GROUP BY tt.id, tt.name, tt.slug
            ORDER BY confidence DESC""",
        tuple(document_ids),
    )
    try:
        taxonomy = cursor.fetchall()
    finally:
        # Close even when fetchall raises, so the cursor is never leaked.
        cursor.close()
    return taxonomy