Protokoll #6245

ID6245
Zeitstempel2025-12-23 05:14:14.945135
Clientroot
IP145.224.96.190
Modellclaude-sonnet-4-20250514
Statuscompleted
Tokens1,345 (Input: 41, Output: 1,304)
Dauer356 ms
Request-Zeit2025-12-23 05:14:14.945135
Response-Zeit2025-12-23 05:14:15.301589

Request

{
    "event": "PreToolUse",
    "tool_name": "Grep",
    "tool_input": {
        "pattern": "def extract",
        "path": "\/opt\/scripts\/pipeline\/extract.py",
        "output_mode": "content",
        "-A": 30
    }
}

Response

{
    "tool_response": {
        "mode": "content",
        "numFiles": 0,
        "filenames": [],
        "content": "13:def extract_pdf(file_path):\n14-    \"\"\"Extract text from PDF using PyMuPDF with optional OCR.\"\"\"\n15-    import fitz  # PyMuPDF\n16-\n17-    doc = fitz.open(file_path)\n18-    pages = []\n19-\n20-    for page_num, page in enumerate(doc):\n21-        text = page.get_text()\n22-\n23-        # If page has little text and OCR is enabled, try OCR\n24-        if OCR_ENABLED and len(text.strip()) < 50:\n25-            # Get page as image and OCR it\n26-            pix = page.get_pixmap(dpi=300)\n27-            img_data = pix.tobytes(\"png\")\n28-\n29-            try:\n30-                import io\n31-\n32-                import pytesseract\n33-                from PIL import Image\n34-\n35-                img = Image.open(io.BytesIO(img_data))\n36-                ocr_text = pytesseract.image_to_string(img, lang=OCR_LANGUAGE)\n37-                if len(ocr_text.strip()) > len(text.strip()):\n38-                    text = ocr_text\n39-            except Exception as e:\n40-                db.log(\"WARNING\", f\"OCR failed for page {page_num}: {e}\")\n41-\n42-        pages.append({\"page\": page_num + 1, \"text\": text.strip(), \"images\": len(page.get_images())})\n43-\n--\n48:def extract_pptx(file_path):\n49-    \"\"\"Extract text from PowerPoint including speaker notes.\"\"\"\n50-    from pptx import Presentation\n51-\n52-    prs = Presentation(file_path)\n53-    slides = []\n54-\n55-    for slide_num, slide in enumerate(prs.slides):\n56-        text_parts = []\n57-\n58-        # Extract text from shapes\n59-        for shape in slide.shapes:\n60-            if hasattr(shape, \"text\") and shape.text:\n61-                text_parts.append(shape.text)\n62-\n63-        # Extract speaker notes\n64-        notes = \"\"\n65-        if slide.has_notes_slide:\n66-            notes_frame = slide.notes_slide.notes_text_frame\n67-            if notes_frame:\n68-                notes = notes_frame.text\n69-\n70-        slides.append({\"slide\": slide_num + 1, \"text\": \"\\n\".join(text_parts), \"notes\": notes})\n71-\n72-    return slides\n73-\n74-\n75:def extract_docx(file_path):\n76-    \"\"\"Extract text from Word document.\"\"\"\n77-    from docx import Document\n78-\n79-    doc = Document(file_path)\n80-    paragraphs = []\n81-\n82-    for para in doc.paragraphs:\n83-        if para.text.strip():\n84-            style = para.style.name if para.style else \"Normal\"\n85-            paragraphs.append({\"text\": para.text, \"style\": style, \"is_heading\": style.startswith(\"Heading\")})\n86-\n87-    return paragraphs\n88-\n89-\n90:def extract_markdown(file_path):\n91-    \"\"\"Extract text from Markdown, preserving structure.\"\"\"\n92-    import frontmatter\n93-\n94-    with open(file_path, encoding=\"utf-8\") as f:\n95-        post = frontmatter.load(f)\n96-\n97-    content = post.content\n98-    metadata = dict(post.metadata)\n99-\n100-    # Parse headings for structure\n101-    sections = []\n102-    current_section = {\"heading\": None, \"level\": 0, \"content\": []}\n103-\n104-    for line in content.split(\"\\n\"):\n105-        heading_match = re.match(r\"^(#{1,6})\\s+(.+)$\", line)\n106-        if heading_match:\n107-            if current_section[\"content\"] or current_section[\"heading\"]:\n108-                sections.append(current_section)\n109-            current_section = {\"heading\": heading_match.group(2), \"level\": len(heading_match.group(1)), \"content\": []}\n110-        else:\n111-            current_section[\"content\"].append(line)\n112-\n113-    if current_section[\"content\"] or current_section[\"heading\"]:\n114-        sections.append(current_section)\n115-\n116-    return {\"metadata\": metadata, \"sections\": sections}\n117-\n118-\n119:def extract_txt(file_path):\n120-    \"\"\"Extract text from plain text file.\"\"\"\n121-    with open(file_path, encoding=\"utf-8\") as f:\n122-        content = f.read()\n123-    return {\"text\": content}\n124-\n125-\n126:def extract(file_path):\n127-    \"\"\"\n128-    Main extraction function.\n129-    Returns extracted content with structure info.\n130-    \"\"\"\n131-    ext = Path(file_path).suffix.lower()\n132-\n133-    extractors = {\n134-        \".pdf\": extract_pdf,\n135-        \".pptx\": extract_pptx,\n136-        \".docx\": extract_docx,\n137-        \".md\": extract_markdown,\n138-        \".txt\": extract_txt,\n139-    }\n140-\n141-    if ext not in extractors:\n142-        raise ValueError(f\"Unsupported file type: {ext}\")\n143-\n144-    db.log(\"INFO\", f\"Extracting: {file_path}\", f\"type={ext}\")\n145-\n146-    try:\n147-        content = extractors[ext](file_path)\n148-        return {\"file_path\": file_path, \"file_type\": ext, \"content\": content, \"success\": True}\n149-    except Exception as e:\n150-        db.log(\"ERROR\", f\"Extraction failed: {file_path}\", str(e))\n151-        return {\"file_path\": file_path, \"file_type\": ext, \"error\": str(e), \"success\": False}\n152-\n153-\n154-def get_full_text(extraction_result):\n155-    \"\"\"Get plain text from extraction result.\"\"\"\n156-    if not extraction_result[\"success\"]:",
        "numLines": 141
    }
}
← Vorheriger Zur Liste Nächster →