semantic_chunk_analyzer.py

Code Hygiene Score: 75

Issues 1

Zeile Typ Beschreibung
- coupling Klasse hat 17 Dependencies (max: 15)

Dependencies 17

Funktionen 1

Code

#!/usr/bin/env python3
"""
Semantic Chunk Analyzer - Chunk-Level Semantic Analysis Pipeline

Analysiert Chunks aus der ki_system.chunks Tabelle und befüllt:
- chunk_semantics (summary, keywords, sentiment, topics, language)
- entities (extrahierte Entitäten)
- chunk_entities (Verknüpfung Chunk <-> Entity)
- entity_relations (Beziehungen zwischen Entitäten)
- taxonomy_terms + chunk_taxonomy (Kategorisierung)

BACKWARD COMPATIBILITY WRAPPER - Logic moved to semantic_chunk/ package.

Usage:
    python semantic_chunk_analyzer.py analyze [--limit N]
    python semantic_chunk_analyzer.py status
    python semantic_chunk_analyzer.py reset
"""

import sys

from db import db

# Re-export for backward compatibility
from semantic_chunk import (
    ANALYSIS_MODEL,
    BATCH_SIZE,
    ChunkRepository,
    ChunkSemantics,
    Entity,
    EntityExtractor,
    EntityRepository,
    OllamaService,
    Relation,
    RelationExtractor,
    SemanticChunkPipeline,
    SemanticsAnalyzer,
    SemanticsRepository,
    TaxonomyClassifier,
    TaxonomyRepository,
)

# Public API of this wrapper module: every name listed here is re-exported
# unchanged from the semantic_chunk package so that existing
# `from semantic_chunk_analyzer import ...` callers keep working.
# Keep this list in sync with the `from semantic_chunk import (...)` block
# above; a name listed here but not imported there breaks `import *`.
__all__ = [
    "ChunkSemantics",
    "Entity",
    "Relation",
    "OllamaService",
    "ANALYSIS_MODEL",
    "SemanticsAnalyzer",
    "EntityExtractor",
    "RelationExtractor",
    "TaxonomyClassifier",
    "ChunkRepository",
    "SemanticsRepository",
    "EntityRepository",
    "TaxonomyRepository",
    "SemanticChunkPipeline",
    "BATCH_SIZE",
]


def main():
    """Run the command-line interface of the semantic chunk analyzer.

    The command is read from ``sys.argv[1]`` (case-insensitive):

    * ``analyze [--limit N]`` -- call ``pipeline.run(limit)`` and print the
      success/processed counts from its result. ``limit`` defaults to
      ``BATCH_SIZE`` and also falls back to ``BATCH_SIZE`` when ``--limit``
      is given without a value.
    * ``status`` -- print the counters returned by ``pipeline.status()``.
    * ``reset`` -- after the user confirms with ``ja``, truncate all
      semantic tables.

    With no command, or with an unknown command, the module usage text is
    printed and the function returns without opening a database connection.

    Raises:
        Exception: Any error raised while a command runs (including a
            ``ValueError`` for a non-integer ``--limit`` value) is logged
            through ``db.log``, printed, and re-raised. The database
            connection is closed in every case.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        return

    command = sys.argv[1].lower()

    # Reject unknown commands up front: a typo should only print the usage
    # text, not require a reachable database or build the whole pipeline.
    if command not in ("analyze", "status", "reset"):
        print(f"Unbekannter Befehl: {command}")
        print(__doc__)
        return

    db.connect()

    try:
        pipeline = SemanticChunkPipeline()

        if command == "analyze":
            limit = BATCH_SIZE
            if len(sys.argv) > 2 and sys.argv[2] == "--limit":
                # "--limit" without a value silently keeps the default.
                limit = int(sys.argv[3]) if len(sys.argv) > 3 else BATCH_SIZE

            result = pipeline.run(limit)
            print(f"\nErgebnis: {result['success']}/{result['processed']} erfolgreich")

        elif command == "status":
            stats = pipeline.status()
            print(f"Chunks gesamt: {stats['total']}")
            print(f"Analysiert:    {stats['analyzed']}")
            print(f"Entitäten:     {stats['entities']}")
            print(f"Ausstehend:    {stats['total'] - stats['analyzed']}")

        elif command == "reset":
            # Destructive operation: require an explicit "ja" from the user.
            confirm = input("Alle Semantik-Daten löschen? (ja/nein): ")
            if confirm.lower() == "ja":
                # Table names are fixed literals, so interpolating them into
                # the statement is not an injection risk.
                # NOTE(review): presumably ordered so link tables are emptied
                # before entities/taxonomy_terms -- confirm before reordering.
                for table in [
                    "chunk_semantics",
                    "chunk_entities",
                    "chunk_taxonomy",
                    "entity_relations",
                    "entities",
                    "taxonomy_terms",
                ]:
                    db.execute(f"TRUNCATE TABLE {table}")
                    db.commit()
                print("Alle Semantik-Tabellen geleert.")

    except Exception as e:
        # Top-level boundary: record the failure, tell the user, then let the
        # exception propagate so the process exits with a traceback.
        db.log("ERROR", f"Pipeline error: {e}")
        print(f"Fehler: {e}")
        raise
    finally:
        db.disconnect()


# Run the CLI only when this file is executed as a script; importing the
# module (e.g. for its re-exported names) must not trigger main().
if __name__ == "__main__":
    main()
← Übersicht