{
"event": "PreToolUse",
"tool_name": "Read",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/extract.py",
"limit": 50
}
}
{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/extract.py",
"content": "\"\"\"\nText extraction for KI-System Pipeline\nExtracts text from PDF, PPTX, DOCX, MD, TXT files.\n\"\"\"\n\nimport re\nfrom pathlib import Path\n\nfrom config import OCR_ENABLED, OCR_LANGUAGE\nfrom db import db\n\n\ndef extract_pdf(file_path):\n \"\"\"Extract text from PDF using PyMuPDF with optional OCR.\"\"\"\n import fitz # PyMuPDF\n\n doc = fitz.open(file_path)\n pages = []\n\n for page_num, page in enumerate(doc):\n text = page.get_text()\n\n # If page has little text and OCR is enabled, try OCR\n if OCR_ENABLED and len(text.strip()) < 50:\n # Get page as image and OCR it\n pix = page.get_pixmap(dpi=300)\n img_data = pix.tobytes(\"png\")\n\n try:\n import io\n\n import pytesseract\n from PIL import Image\n\n img = Image.open(io.BytesIO(img_data))\n ocr_text = pytesseract.image_to_string(img, lang=OCR_LANGUAGE)\n if len(ocr_text.strip()) > len(text.strip()):\n text = ocr_text\n except Exception as e:\n db.log(\"WARNING\", f\"OCR failed for page {page_num}: {e}\")\n\n pages.append({\"page\": page_num + 1, \"text\": text.strip(), \"images\": len(page.get_images())})\n\n doc.close()\n return pages\n\n\ndef extract_pptx(file_path):\n \"\"\"Extract text from PowerPoint including speaker notes.\"\"\"\n from pptx import Presentation",
"numLines": 50,
"startLine": 1,
"totalLines": 200
}
}
}