{
"event": "PreToolUse",
"tool_name": "Read",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/step_extract.py"
}
}
{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/step_extract.py",
"content": "\"\"\"\nExtraction Step Module\nHandles document text extraction for all supported formats.\n\nPart of modularized pipeline architecture.\n\"\"\"\n\nimport hashlib\nimport os\nfrom pathlib import Path\n\nfrom extract import extract, get_full_text\n\n\nclass ExtractionStep:\n \"\"\"Step: Extract text from documents.\"\"\"\n\n def __init__(self, db, progress=None):\n \"\"\"\n Initialize extraction step.\n\n Args:\n db: Database instance\n progress: Optional PipelineProgress instance\n \"\"\"\n self.db = db\n self.progress = progress\n\n def execute(self, file_path):\n \"\"\"\n Extract text from document.\n\n Args:\n file_path: Path to document file\n\n Returns:\n dict: {\n 'success': bool,\n 'extraction': dict with content\/metadata,\n 'file_info': dict with hash\/size\/type,\n 'error': str (if failed)\n }\n \"\"\"\n file_name = Path(file_path).name\n\n if self.progress:\n self.progress.update_step(\"extract\")\n self.progress.add_log(f\"Extrahiere Text: {file_name}\")\n\n self.db.log(\"INFO\", f\"Extracting: {file_path}\")\n\n # Check cancellation\n if self.progress and self.progress.is_cancelled():\n return {\"success\": False, \"error\": \"cancelled\"}\n\n # Extract content\n extraction = extract(file_path)\n if not extraction[\"success\"]:\n error = extraction.get(\"error\", \"Unknown extraction error\")\n self.db.log(\"ERROR\", f\"Extraction failed: {error}\")\n if self.progress:\n self.progress.add_log(\"FEHLER: Extraktion fehlgeschlagen\")\n return {\"success\": False, \"error\": error}\n\n # Get file metadata\n file_stat = os.stat(file_path)\n with open(file_path, \"rb\") as f:\n file_hash = hashlib.sha256(f.read()).hexdigest()\n\n file_info = {\n \"path\": file_path,\n \"name\": file_name,\n \"type\": extraction[\"file_type\"],\n \"size\": file_stat.st_size,\n \"hash\": file_hash,\n }\n\n # Count pages for PDFs\n total_pages = 0\n if extraction[\"file_type\"] == \".pdf\" and isinstance(extraction.get(\"content\"), list):\n total_pages = len(extraction[\"content\"])\n if self.progress:\n self.progress.add_log(f\"{total_pages} Seiten extrahiert\")\n\n self.db.log(\"INFO\", f\"Extraction complete: {total_pages} pages\" if total_pages else \"Extraction complete\")\n\n return {\n \"success\": True,\n \"extraction\": extraction,\n \"file_info\": file_info,\n \"total_pages\": total_pages,\n }\n\n def get_full_text_from_extraction(self, extraction):\n \"\"\"\n Get full text from extraction result.\n\n Args:\n extraction: Extraction result dict\n\n Returns:\n str: Full document text\n \"\"\"\n return get_full_text(extraction)\n",
"numLines": 105,
"startLine": 1,
"totalLines": 105
}
}
}