{
"event": "PreToolUse",
"tool_name": "Read",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/migrate_semantics.py"
}
}
{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/step_semantic_extended.py",
"content": "\"\"\"\nExtended Semantic Steps for Scientific Pipeline v1.\n\nImplements 6 new step_types for Pipeline #5:\n1. DuplicateCheckStep - Hash-based duplicate detection\n2. TextSemanticAnalyzeStep - Analyzes HOW text is structured (Textsemantik)\n3. TextSemanticStoreStep - Stores text semantics to chunk_semantics (unified)\n4. KnowledgeSemanticAnalyzeStep - Analyzes WHAT entities MEAN (Wissenssemantik)\n5. KnowledgeSemanticStoreStep - Stores knowledge semantics to entity_semantics (unified)\n6. OntologyStoreStep - Stores ontology classifications\n\nPart of Pipeline-Refactoring based on scientific specification.\nUpdated 2025-12-28: Unified schema - merged chunk_text_semantics into chunk_semantics,\n entity_knowledge_semantics into entity_semantics.\n\"\"\"\n\nimport json\n\nimport ollama\n\n\nclass DuplicateCheckStep:\n \"\"\"Step: Check for duplicate documents via content hash.\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, doc_id: int, content_hash: str) -> dict:\n \"\"\"\n Check if document with same hash already exists.\n\n Args:\n doc_id: Current document ID\n content_hash: SHA-256 hash of document content\n\n Returns:\n dict: {status: 'ok'|'abort', reason: str, duplicate_id: int|None}\n \"\"\"\n if self.progress:\n self.progress.update_step(\"duplicate_check\")\n self.progress.add_log(\"Prüfe auf Duplikate...\")\n\n if not content_hash:\n return {\"status\": \"skip\", \"reason\": \"no_hash\"}\n\n cursor = self.db.execute(\n \"\"\"SELECT id, source_path FROM documents\n WHERE file_hash = %s AND id != %s AND status = 'done'\n LIMIT 1\"\"\",",
"numLines": 50,
"startLine": 1,
"totalLines": 464
}
}
}