{
"event": "PostToolUse",
"tool_name": "Read",
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/step_transform.py",
"content": "\"\"\"\nTransformation Step Module\nHandles chunking and semantic analysis of extracted content.\n\nPart of modularized pipeline architecture.\n\"\"\"\n\nfrom analyze import analyze_document\nfrom chunk import chunk_by_structure\nfrom enrich import run_enrichment_step\nfrom vision import run_vision_step\n\n\nclass TransformationStep:\n \"\"\"Step: Transform extracted content into chunks and analyze.\"\"\"\n\n def __init__(self, db, progress=None):\n \"\"\"\n Initialize transformation step.\n\n Args:\n db: Database instance\n progress: Optional PipelineProgress instance\n \"\"\"\n self.db = db\n self.progress = progress\n\n def execute_vision(self, doc_id, file_path, file_type):\n \"\"\"\n Execute vision analysis for PDFs.\n\n Args:\n doc_id: Document database ID\n file_path: Path to file\n file_type: File extension\n\n Returns:\n dict: Vision analysis result\n \"\"\"\n if file_type != \".pdf\":\n return {\"success\": False, \"error\": \"Not a PDF\"}\n\n if self.progress:\n self.progress.update_step(\"vision\")\n self.progress.add_log(\"Vision-Analyse gestartet...\")\n\n self.db.log(\"INFO\", f\"Running vision analysis for document {doc_id}\")\n\n vision_config = {\n \"model\": \"llama3.2-vision:11b\",\n \"store_images\": True,\n \"detect_images\": True,\n \"detect_charts\": True,\n \"detect_tables\": True,\n }\n\n vision_result = run_vision_step(doc_id, file_path, vision_config, progress=self.progress)\n\n if vision_result[\"success\"]:\n self.db.log(\"INFO\", f\"Vision: {vision_result['pages_analyzed']}\/{vision_result['pages_total']} pages\")\n if self.progress:\n self.progress.add_log(f\"Vision: {vision_result['pages_analyzed']} Seiten analysiert\")\n else:\n self.db.log(\"WARNING\", f\"Vision analysis failed: {vision_result.get('error')}\")\n\n return vision_result\n\n def execute_chunking(self, extraction, total_pages=0):\n \"\"\"\n Chunk extracted content.\n\n Args:\n extraction: Extraction result dict\n total_pages: Number of pages (for logging)\n\n Returns:\n list: Chunk dictionaries\n \"\"\"\n if self.progress:\n self.progress.update_step(\"chunk\")\n if total_pages > 0:\n self.progress.add_log(f\"Erstelle Chunks aus {total_pages} Seiten...\")\n else:\n self.progress.add_log(\"Erstelle Chunks...\")\n\n chunks = chunk_by_structure(extraction)\n self.db.log(\"INFO\", f\"Created {len(chunks)} chunks\")\n\n if self.progress:\n msg = f\"{len(chunks)} Chunks erstellt\"\n if total_pages > 0:\n msg += f\" (aus {total_pages} Seiten)\"\n self.progress.add_log(msg)\n\n return chunks\n\n def execute_enrichment(self, doc_id, file_type):\n \"\"\"\n Enrich chunks with vision context (PDFs only).\n\n Args:\n doc_id: Document database ID\n file_type: File extension\n\n Returns:\n dict: Enrichment result\n \"\"\"\n if file_type != \".pdf\":\n return {\"success\": False, \"error\": \"Not a PDF\"}\n\n if self.progress:\n self.progress.update_step(\"enrich\")\n\n self.db.log(\"INFO\", f\"Running vision enrichment for document {doc_id}\")\n\n enrich_result = run_enrichment_step(doc_id)\n\n if enrich_result[\"success\"]:\n self.db.log(\"INFO\", f\"Enrichment: {enrich_result['enriched']}\/{enrich_result['total_chunks']} chunks\")\n else:\n self.db.log(\"WARNING\", f\"Enrichment failed: {enrich_result.get('error')}\")\n\n return enrich_result\n\n def execute_analysis(self, doc_id, full_text):\n \"\"\"\n Execute semantic analysis on document.\n\n Args:\n doc_id: Document database ID\n full_text: Full document text\n\n Returns:\n dict: Analysis result\n \"\"\"\n if self.progress:\n self.progress.update_step(\"analyze\")\n\n analysis = analyze_document(doc_id, full_text, progress=self.progress)\n self.db.log(\"INFO\", f\"Analysis complete: {analysis}\")\n\n return analysis\n",
"numLines": 143,
"startLine": 1,
"totalLines": 143
}
}
}