{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/step_semantic_extended.py",
"content": "\"\"\"\nExtended Semantic Steps for Scientific Pipeline v1.\n\nImplements 6 new step_types for Pipeline #5:\n1. DuplicateCheckStep - Hash-based duplicate detection\n2. TextSemanticAnalyzeStep - Analyzes HOW text is structured (Textsemantik)\n3. TextSemanticStoreStep - Stores text semantics to chunk_text_semantics\n4. KnowledgeSemanticAnalyzeStep - Analyzes WHAT entities MEAN (Wissenssemantik)\n5. KnowledgeSemanticStoreStep - Stores knowledge semantics to entity_knowledge_semantics\n6. OntologyStoreStep - Stores ontology classifications\n\nPart of Pipeline-Refactoring based on scientific specification.\n\"\"\"\n\nimport json\n\nimport ollama\n\nfrom constants import LLM_TIMEOUT, MAX_RETRIES\n\n\nclass DuplicateCheckStep:\n \"\"\"Step: Check for duplicate documents via content hash.\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, doc_id: int, content_hash: str) -> dict:\n \"\"\"\n Check if document with same hash already exists.\n\n Args:\n doc_id: Current document ID\n content_hash: SHA-256 hash of document content\n\n Returns:\n dict: {status: 'ok'|'abort', reason: str, duplicate_id: int|None}\n \"\"\"\n if self.progress:\n self.progress.update_step(\"duplicate_check\")\n self.progress.add_log(\"Prüfe auf Duplikate...\")\n\n if not content_hash:\n return {\"status\": \"skip\", \"reason\": \"no_hash\"}\n\n cursor = self.db.execute(\n \"\"\"SELECT id, source_path FROM documents\n WHERE file_hash = %s AND id != %s AND status = 'done'\n LIMIT 1\"\"\",\n (content_hash, doc_id),\n )\n existing = cursor.fetchone()\n cursor.close()\n\n if existing:\n self.db.log(\n \"INFO\",\n f\"Duplicate found: doc {doc_id} matches {existing['id']} ({existing['source_path']})\",\n )\n if self.progress:\n self.progress.add_log(f\"Duplikat gefunden: ID {existing['id']}\")\n\n return {\n \"status\": \"abort\",\n \"reason\": \"duplicate\",\n \"duplicate_id\": existing[\"id\"],\n \"duplicate_path\": existing[\"source_path\"],\n }\n\n if self.progress:\n self.progress.add_log(\"Kein Duplikat gefunden\")\n\n return {\"status\": \"ok\"}\n\n\nclass TextSemanticAnalyzeStep:\n \"\"\"Step: Analyze HOW text is structured (Textsemantik).\n\n Analyzes each chunk for:\n - statement_form: assertion, question, command, conditional\n - intent: explain, argue, define, compare, exemplify, warn, instruct\n - frame: theoretical, practical, historical, methodological, critical\n - is_negated: whether the statement is negated\n - discourse_role: thesis, evidence, example, counter, summary, definition\n \"\"\"\n\n PROMPT_TEMPLATE = \"\"\"Analysiere den folgenden Text semantisch.\n\nBestimme:\n1. statement_form: Ist es eine Aussage (assertion), Frage (question), Aufforderung (command) oder Bedingung (conditional)?\n2. intent: Was ist die Absicht? explain, argue, define, compare, exemplify, warn, instruct\n3. frame: Welcher Rahmen? theoretical, practical, historical, methodological, critical\n4. is_negated: Wird etwas verneint? true\/false\n5. discourse_role: Welche Rolle im Diskurs? thesis, evidence, example, counter, summary, definition\n\nAntworte NUR mit gültigem JSON:\n{{\n \"statement_form\": \"assertion|question|command|conditional\",\n \"intent\": \"explain|argue|define|compare|exemplify|warn|instruct\",\n \"frame\": \"theoretical|practical|historical|methodological|critical\",\n \"is_negated\": false,\n \"discourse_role\": \"thesis|evidence|example|counter|summary|definition\"\n}}\n\nText:\n{content}\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, chunks: list, config: dict) -> list:\n \"\"\"\n Analyze text semantics for each chunk.\n\n Args:\n chunks: List of chunk dicts with 'id' and 'content'\n config: Step config with 'model' (default: mistral)\n\n Returns:\n list: Chunks with added 'text_semantics' field\n \"\"\"\n if self.progress:\n self.progress.update_step(\"text_semantic_analyze\")\n self.progress.add_log(f\"Textsemantik-Analyse für {len(chunks)} Chunks...\")\n\n model = config.get(\"model\", \"mistral\")\n analyzed = 0\n errors = 0\n\n for chunk in chunks:\n try:\n prompt = self.PROMPT_TEMPLATE.format(content=chunk[\"content\"][:2000])\n\n response = ollama.generate(\n model=model,\n prompt=prompt,\n options={\"num_predict\": 200},\n )\n\n # Parse JSON response\n response_text = response[\"response\"].strip()\n # Find JSON in response\n start = response_text.find(\"{\")\n end = response_text.rfind(\"}\") + 1\n if start >= 0 and end > start:\n json_str = response_text[start:end]\n chunk[\"text_semantics\"] = json.loads(json_str)\n chunk[\"text_semantics\"][\"model_used\"] = model\n analyzed += 1\n else:\n chunk[\"text_semantics\"] = None\n errors += 1\n\n except Exception as e:\n self.db.log(\"WARNING\", f\"Text semantic analysis failed for chunk {chunk['id']}: {e}\")\n chunk[\"text_semantics\"] = None\n errors += 1\n\n if self.progress:\n self.progress.add_log(f\"Textsemantik: {analyzed} analysiert, {errors} Fehler\")\n\n self.db.log(\"INFO\", f\"Text semantic analysis: {analyzed} chunks, {errors} errors\")\n return chunks\n\n\nclass TextSemanticStoreStep:\n \"\"\"Step: Store text semantics to database.\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, chunks: list, config: dict) -> dict:\n \"\"\"\n Store text semantics from chunks to chunk_text_semantics table.\n\n Args:\n chunks: List of chunks with 'text_semantics' field\n config: Step config\n\n Returns:\n dict: {stored: int, skipped: int}\n \"\"\"\n if self.progress:\n self.progress.update_step(\"text_semantic_store\")\n self.progress.add_log(\"Speichere Textsemantik...\")\n\n stored = 0\n skipped = 0\n\n for chunk in chunks:\n if not chunk.get(\"text_semantics\"):\n skipped += 1\n continue\n\n sem = chunk[\"text_semantics\"]\n\n try:\n cursor = self.db.execute(\n \"\"\"INSERT INTO chunk_text_semantics\n (chunk_id, statement_form, intent, frame, is_negated,\n discourse_role, model_used)\n VALUES (%s, %s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n statement_form = VALUES(statement_form),\n intent = VALUES(intent),\n frame = VALUES(frame),\n is_negated = VALUES(is_negated),\n discourse_role = VALUES(discourse_role),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n chunk[\"id\"],\n sem.get(\"statement_form\"),\n sem.get(\"intent\"),\n sem.get(\"frame\"),\n sem.get(\"is_negated\", False),\n sem.get(\"discourse_role\"),\n sem.get(\"model_used\"),\n ),\n )\n self.db.commit()\n cursor.close()\n stored += 1\n\n except Exception as e:\n self.db.log(\"ERROR\", f\"Failed to store text semantics for chunk {chunk['id']}: {e}\")\n skipped += 1\n\n if self.progress:\n self.progress.add_log(f\"Textsemantik gespeichert: {stored}\")\n\n self.db.log(\"INFO\", f\"Text semantics stored: {stored}, skipped: {skipped}\")\n return {\"stored\": stored, \"skipped\": skipped}\n\n\nclass KnowledgeSemanticAnalyzeStep:\n \"\"\"Step: Analyze WHAT entities MEAN in context (Wissenssemantik).\n\n Analyzes each entity for:\n - semantic_role: agent, patient, instrument, location, cause, effect\n - properties: JSON object with entity properties\n - functional_category: method, tool, concept, actor, outcome, process\n - context_meaning: Brief explanation of entity's meaning in context\n \"\"\"\n\n PROMPT_TEMPLATE = \"\"\"Analysiere die Bedeutung dieser Entität im Kontext.\n\nEntität: {name}\nTyp: {entity_type}\nKontext: {context}\n\nBestimme:\n1. semantic_role: Welche Rolle spielt die Entität? agent, patient, instrument, location, cause, effect\n2. properties: Welche Eigenschaften hat sie? (als JSON-Objekt)\n3. functional_category: Welche Funktion? method, tool, concept, actor, outcome, process\n4. context_meaning: Was bedeutet die Entität in diesem Kontext? (1 Satz)\n\nAntworte NUR mit gültigem JSON:\n{{\n \"semantic_role\": \"agent|patient|instrument|location|cause|effect\",\n \"properties\": {{\"key\": \"value\"}},\n \"functional_category\": \"method|tool|concept|actor|outcome|process\",\n \"context_meaning\": \"Kurze Erklärung\"\n}}\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, entities: list, config: dict) -> list:\n \"\"\"\n Analyze knowledge semantics for each entity.\n\n Args:\n entities: List of entity dicts with 'id', 'name', 'type', 'context'\n config: Step config with 'model' (default: mistral)\n\n Returns:\n list: Entities with added 'knowledge_semantics' field\n \"\"\"\n if self.progress:\n self.progress.update_step(\"k... [TRUNCATED-dea7bd333c7a02b9]"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/step_semantic_extended.py",
"content": "\"\"\"\nExtended Semantic Steps for Scientific Pipeline v1.\n\nImplements 6 new step_types for Pipeline #5:\n1. DuplicateCheckStep - Hash-based duplicate detection\n2. TextSemanticAnalyzeStep - Analyzes HOW text is structured (Textsemantik)\n3. TextSemanticStoreStep - Stores text semantics to chunk_text_semantics\n4. KnowledgeSemanticAnalyzeStep - Analyzes WHAT entities MEAN (Wissenssemantik)\n5. KnowledgeSemanticStoreStep - Stores knowledge semantics to entity_knowledge_semantics\n6. OntologyStoreStep - Stores ontology classifications\n\nPart of Pipeline-Refactoring based on scientific specification.\n\"\"\"\n\nimport json\n\nimport ollama\n\nfrom constants import LLM_TIMEOUT, MAX_RETRIES\n\n\nclass DuplicateCheckStep:\n \"\"\"Step: Check for duplicate documents via content hash.\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, doc_id: int, content_hash: str) -> dict:\n \"\"\"\n Check if document with same hash already exists.\n\n Args:\n doc_id: Current document ID\n content_hash: SHA-256 hash of document content\n\n Returns:\n dict: {status: 'ok'|'abort', reason: str, duplicate_id: int|None}\n \"\"\"\n if self.progress:\n self.progress.update_step(\"duplicate_check\")\n self.progress.add_log(\"Prüfe auf Duplikate...\")\n\n if not content_hash:\n return {\"status\": \"skip\", \"reason\": \"no_hash\"}\n\n cursor = self.db.execute(\n \"\"\"SELECT id, source_path FROM documents\n WHERE file_hash = %s AND id != %s AND status = 'done'\n LIMIT 1\"\"\",\n (content_hash, doc_id),\n )\n existing = cursor.fetchone()\n cursor.close()\n\n if existing:\n self.db.log(\n \"INFO\",\n f\"Duplicate found: doc {doc_id} matches {existing['id']} ({existing['source_path']})\",\n )\n if self.progress:\n self.progress.add_log(f\"Duplikat gefunden: ID {existing['id']}\")\n\n return {\n \"status\": \"abort\",\n \"reason\": \"duplicate\",\n \"duplicate_id\": existing[\"id\"],\n \"duplicate_path\": existing[\"source_path\"],\n }\n\n if self.progress:\n self.progress.add_log(\"Kein Duplikat gefunden\")\n\n return {\"status\": \"ok\"}\n\n\nclass TextSemanticAnalyzeStep:\n \"\"\"Step: Analyze HOW text is structured (Textsemantik).\n\n Analyzes each chunk for:\n - statement_form: assertion, question, command, conditional\n - intent: explain, argue, define, compare, exemplify, warn, instruct\n - frame: theoretical, practical, historical, methodological, critical\n - is_negated: whether the statement is negated\n - discourse_role: thesis, evidence, example, counter, summary, definition\n \"\"\"\n\n PROMPT_TEMPLATE = \"\"\"Analysiere den folgenden Text semantisch.\n\nBestimme:\n1. statement_form: Ist es eine Aussage (assertion), Frage (question), Aufforderung (command) oder Bedingung (conditional)?\n2. intent: Was ist die Absicht? explain, argue, define, compare, exemplify, warn, instruct\n3. frame: Welcher Rahmen? theoretical, practical, historical, methodological, critical\n4. is_negated: Wird etwas verneint? true\/false\n5. discourse_role: Welche Rolle im Diskurs? thesis, evidence, example, counter, summary, definition\n\nAntworte NUR mit gültigem JSON:\n{{\n \"statement_form\": \"assertion|question|command|conditional\",\n \"intent\": \"explain|argue|define|compare|exemplify|warn|instruct\",\n \"frame\": \"theoretical|practical|historical|methodological|critical\",\n \"is_negated\": false,\n \"discourse_role\": \"thesis|evidence|example|counter|summary|definition\"\n}}\n\nText:\n{content}\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, chunks: list, config: dict) -> list:\n \"\"\"\n Analyze text semantics for each chunk.\n\n Args:\n chunks: List of chunk dicts with 'id' and 'content'\n config: Step config with 'model' (default: mistral)\n\n Returns:\n list: Chunks with added 'text_semantics' field\n \"\"\"\n if self.progress:\n self.progress.update_step(\"text_semantic_analyze\")\n self.progress.add_log(f\"Textsemantik-Analyse für {len(chunks)} Chunks...\")\n\n model = config.get(\"model\", \"mistral\")\n analyzed = 0\n errors = 0\n\n for chunk in chunks:\n try:\n prompt = self.PROMPT_TEMPLATE.format(content=chunk[\"content\"][:2000])\n\n response = ollama.generate(\n model=model,\n prompt=prompt,\n options={\"num_predict\": 200},\n )\n\n # Parse JSON response\n response_text = response[\"response\"].strip()\n # Find JSON in response\n start = response_text.find(\"{\")\n end = response_text.rfind(\"}\") + 1\n if start >= 0 and end > start:\n json_str = response_text[start:end]\n chunk[\"text_semantics\"] = json.loads(json_str)\n chunk[\"text_semantics\"][\"model_used\"] = model\n analyzed += 1\n else:\n chunk[\"text_semantics\"] = None\n errors += 1\n\n except Exception as e:\n self.db.log(\"WARNING\", f\"Text semantic analysis failed for chunk {chunk['id']}: {e}\")\n chunk[\"text_semantics\"] = None\n errors += 1\n\n if self.progress:\n self.progress.add_log(f\"Textsemantik: {analyzed} analysiert, {errors} Fehler\")\n\n self.db.log(\"INFO\", f\"Text semantic analysis: {analyzed} chunks, {errors} errors\")\n return chunks\n\n\nclass TextSemanticStoreStep:\n \"\"\"Step: Store text semantics to database.\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, chunks: list, config: dict) -> dict:\n \"\"\"\n Store text semantics from chunks to chunk_text_semantics table.\n\n Args:\n chunks: List of chunks with 'text_semantics' field\n config: Step config\n\n Returns:\n dict: {stored: int, skipped: int}\n \"\"\"\n if self.progress:\n self.progress.update_step(\"text_semantic_store\")\n self.progress.add_log(\"Speichere Textsemantik...\")\n\n stored = 0\n skipped = 0\n\n for chunk in chunks:\n if not chunk.get(\"text_semantics\"):\n skipped += 1\n continue\n\n sem = chunk[\"text_semantics\"]\n\n try:\n cursor = self.db.execute(\n \"\"\"INSERT INTO chunk_text_semantics\n (chunk_id, statement_form, intent, frame, is_negated,\n discourse_role, model_used)\n VALUES (%s, %s, %s, %s, %s, %s, %s)\n ON DUPLICATE KEY UPDATE\n statement_form = VALUES(statement_form),\n intent = VALUES(intent),\n frame = VALUES(frame),\n is_negated = VALUES(is_negated),\n discourse_role = VALUES(discourse_role),\n model_used = VALUES(model_used),\n updated_at = NOW()\"\"\",\n (\n chunk[\"id\"],\n sem.get(\"statement_form\"),\n sem.get(\"intent\"),\n sem.get(\"frame\"),\n sem.get(\"is_negated\", False),\n sem.get(\"discourse_role\"),\n sem.get(\"model_used\"),\n ),\n )\n self.db.commit()\n cursor.close()\n stored += 1\n\n except Exception as e:\n self.db.log(\"ERROR\", f\"Failed to store text semantics for chunk {chunk['id']}: {e}\")\n skipped += 1\n\n if self.progress:\n self.progress.add_log(f\"Textsemantik gespeichert: {stored}\")\n\n self.db.log(\"INFO\", f\"Text semantics stored: {stored}, skipped: {skipped}\")\n return {\"stored\": stored, \"skipped\": skipped}\n\n\nclass KnowledgeSemanticAnalyzeStep:\n \"\"\"Step: Analyze WHAT entities MEAN in context (Wissenssemantik).\n\n Analyzes each entity for:\n - semantic_role: agent, patient, instrument, location, cause, effect\n - properties: JSON object with entity properties\n - functional_category: method, tool, concept, actor, outcome, process\n - context_meaning: Brief explanation of entity's meaning in context\n \"\"\"\n\n PROMPT_TEMPLATE = \"\"\"Analysiere die Bedeutung dieser Entität im Kontext.\n\nEntität: {name}\nTyp: {entity_type}\nKontext: {context}\n\nBestimme:\n1. semantic_role: Welche Rolle spielt die Entität? agent, patient, instrument, location, cause, effect\n2. properties: Welche Eigenschaften hat sie? (als JSON-Objekt)\n3. functional_category: Welche Funktion? method, tool, concept, actor, outcome, process\n4. context_meaning: Was bedeutet die Entität in diesem Kontext? (1 Satz)\n\nAntworte NUR mit gültigem JSON:\n{{\n \"semantic_role\": \"agent|patient|instrument|location|cause|effect\",\n \"properties\": {{\"key\": \"value\"}},\n \"functional_category\": \"method|tool|concept|actor|outcome|process\",\n \"context_meaning\": \"Kurze Erklärung\"\n}}\"\"\"\n\n def __init__(self, db, progress=None):\n self.db = db\n self.progress = progress\n\n def execute(self, entities: list, config: dict) -> list:\n \"\"\"\n Analyze knowledge semantics for each entity.\n\n Args:\n entities: List of entity dicts with 'id', 'name', 'type', 'context'\n config: Step config with 'model' (default: mistral)\n\n Returns:\n list: Entities with added 'knowledge_semantics' field\n \"\"\"\n if self.progress:\n self.progress.update_step(\"k... [TRUNCATED-dea7bd333c7a02b9]",
"structuredPatch": [],
"originalFile": null
}
}