semantic_chunk_analyzer.py
- Pfad:
/var/www/scripts/pipeline/semantic_chunk_analyzer.py
- Namespace: pipeline
- Zeilen: 119 | Größe: 3,140 Bytes
- Geändert: 2025-12-25 18:29:01 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 75
- Dependencies: 0 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Issues 1
| Zeile | Typ | Beschreibung |
| --- | --- | --- |
| - | coupling | Klasse hat 17 Dependencies (max: 15) |
Dependencies 17
- use sys
- use db.db
- use semantic_chunk.ANALYSIS_MODEL
- use semantic_chunk.BATCH_SIZE
- use semantic_chunk.ChunkRepository
- use semantic_chunk.ChunkSemantics
- use semantic_chunk.Entity
- use semantic_chunk.EntityExtractor
- use semantic_chunk.EntityRepository
- use semantic_chunk.OllamaService
- use semantic_chunk.Relation
- use semantic_chunk.RelationExtractor
- use semantic_chunk.SemanticChunkPipeline
- use semantic_chunk.SemanticsAnalyzer
- use semantic_chunk.SemanticsRepository
- use semantic_chunk.TaxonomyClassifier
- use semantic_chunk.TaxonomyRepository
Funktionen 1
Code
#!/usr/bin/env python3
"""
Semantic Chunk Analyzer - Chunk-Level Semantic Analysis Pipeline
Analysiert Chunks aus der ki_system.chunks Tabelle und befüllt:
- chunk_semantics (summary, keywords, sentiment, topics, language)
- entities (extrahierte Entitäten)
- chunk_entities (Verknüpfung Chunk <-> Entity)
- entity_relations (Beziehungen zwischen Entitäten)
- taxonomy_terms + chunk_taxonomy (Kategorisierung)
BACKWARD COMPATIBILITY WRAPPER - Logic moved to semantic_chunk/ package.
Usage:
python semantic_chunk_analyzer.py analyze [--limit N]
python semantic_chunk_analyzer.py status
python semantic_chunk_analyzer.py reset
"""
import sys
from db import db
# Re-export for backward compatibility
from semantic_chunk import (
ANALYSIS_MODEL,
BATCH_SIZE,
ChunkRepository,
ChunkSemantics,
Entity,
EntityExtractor,
EntityRepository,
OllamaService,
Relation,
RelationExtractor,
SemanticChunkPipeline,
SemanticsAnalyzer,
SemanticsRepository,
TaxonomyClassifier,
TaxonomyRepository,
)
# Public API of this backward-compatibility wrapper: mirrors every name
# re-exported from the semantic_chunk package above so that legacy
# `from semantic_chunk_analyzer import *` callers keep working unchanged.
__all__ = [
    "ChunkSemantics",
    "Entity",
    "Relation",
    "OllamaService",
    "ANALYSIS_MODEL",
    "SemanticsAnalyzer",
    "EntityExtractor",
    "RelationExtractor",
    "TaxonomyClassifier",
    "ChunkRepository",
    "SemanticsRepository",
    "EntityRepository",
    "TaxonomyRepository",
    "SemanticChunkPipeline",
    "BATCH_SIZE",
]
def _parse_limit(args):
    """Parse an optional ``--limit N`` from *args*, defaulting to BATCH_SIZE.

    Args:
        args: the CLI arguments after the command word (sys.argv[2:]).

    Returns:
        The requested batch limit as an int.

    Raises:
        ValueError: when the value after --limit is not an integer; the
            message names the offending value instead of the bare int()
            traceback the old inline parsing produced.
    """
    if args and args[0] == "--limit":
        if len(args) < 2:
            # Flag given without a value: keep the original silent fallback.
            return BATCH_SIZE
        try:
            return int(args[1])
        except ValueError:
            raise ValueError(f"Ungültiger Wert für --limit: {args[1]!r}") from None
    return BATCH_SIZE


# Tables cleared by the "reset" command. Order matters: link tables first,
# then the entities/terms they reference, to avoid FK violations.
_SEMANTIC_TABLES = (
    "chunk_semantics",
    "chunk_entities",
    "chunk_taxonomy",
    "entity_relations",
    "entities",
    "taxonomy_terms",
)


def main():
    """CLI entry point: dispatch the "analyze", "status" or "reset" command.

    Opens the shared db connection for the duration of the command and
    always disconnects in the ``finally`` block. Unexpected errors are
    logged via ``db.log("ERROR", ...)``, echoed to stdout, and re-raised.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        return

    command = sys.argv[1].lower()
    db.connect()
    try:
        pipeline = SemanticChunkPipeline()
        if command == "analyze":
            limit = _parse_limit(sys.argv[2:])
            result = pipeline.run(limit)
            print(f"\nErgebnis: {result['success']}/{result['processed']} erfolgreich")
        elif command == "status":
            stats = pipeline.status()
            print(f"Chunks gesamt: {stats['total']}")
            print(f"Analysiert: {stats['analyzed']}")
            print(f"Entitäten: {stats['entities']}")
            print(f"Ausstehend: {stats['total'] - stats['analyzed']}")
        elif command == "reset":
            confirm = input("Alle Semantik-Daten löschen? (ja/nein): ")
            if confirm.lower() == "ja":
                for table in _SEMANTIC_TABLES:
                    # Table names come from the fixed tuple above, never from
                    # user input, so the f-string SQL here is safe.
                    db.execute(f"TRUNCATE TABLE {table}")
                db.commit()
                print("Alle Semantik-Tabellen geleert.")
        else:
            print(f"Unbekannter Befehl: {command}")
            print(__doc__)
    except Exception as e:
        db.log("ERROR", f"Pipeline error: {e}")
        print(f"Fehler: {e}")
        raise
    finally:
        db.disconnect()


if __name__ == "__main__":
    main()