{
"event": "PreToolUse",
"tool_name": "Grep",
"tool_input": {
"pattern": "def extract",
"path": "\/opt\/scripts\/pipeline\/extract.py",
"output_mode": "content",
"-A": 30
}
}
{
"tool_response": {
"mode": "content",
"numFiles": 0,
"filenames": [],
"content": "13:def extract_pdf(file_path):\n14- \"\"\"Extract text from PDF using PyMuPDF with optional OCR.\"\"\"\n15- import fitz # PyMuPDF\n16-\n17- doc = fitz.open(file_path)\n18- pages = []\n19-\n20- for page_num, page in enumerate(doc):\n21- text = page.get_text()\n22-\n23- # If page has little text and OCR is enabled, try OCR\n24- if OCR_ENABLED and len(text.strip()) < 50:\n25- # Get page as image and OCR it\n26- pix = page.get_pixmap(dpi=300)\n27- img_data = pix.tobytes(\"png\")\n28-\n29- try:\n30- import io\n31-\n32- import pytesseract\n33- from PIL import Image\n34-\n35- img = Image.open(io.BytesIO(img_data))\n36- ocr_text = pytesseract.image_to_string(img, lang=OCR_LANGUAGE)\n37- if len(ocr_text.strip()) > len(text.strip()):\n38- text = ocr_text\n39- except Exception as e:\n40- db.log(\"WARNING\", f\"OCR failed for page {page_num}: {e}\")\n41-\n42- pages.append({\"page\": page_num + 1, \"text\": text.strip(), \"images\": len(page.get_images())})\n43-\n--\n48:def extract_pptx(file_path):\n49- \"\"\"Extract text from PowerPoint including speaker notes.\"\"\"\n50- from pptx import Presentation\n51-\n52- prs = Presentation(file_path)\n53- slides = []\n54-\n55- for slide_num, slide in enumerate(prs.slides):\n56- text_parts = []\n57-\n58- # Extract text from shapes\n59- for shape in slide.shapes:\n60- if hasattr(shape, \"text\") and shape.text:\n61- text_parts.append(shape.text)\n62-\n63- # Extract speaker notes\n64- notes = \"\"\n65- if slide.has_notes_slide:\n66- notes_frame = slide.notes_slide.notes_text_frame\n67- if notes_frame:\n68- notes = notes_frame.text\n69-\n70- slides.append({\"slide\": slide_num + 1, \"text\": \"\\n\".join(text_parts), \"notes\": notes})\n71-\n72- return slides\n73-\n74-\n75:def extract_docx(file_path):\n76- \"\"\"Extract text from Word document.\"\"\"\n77- from docx import Document\n78-\n79- doc = Document(file_path)\n80- paragraphs = []\n81-\n82- for para in doc.paragraphs:\n83- if para.text.strip():\n84- style = para.style.name if para.style else \"Normal\"\n85- paragraphs.append({\"text\": para.text, \"style\": style, \"is_heading\": style.startswith(\"Heading\")})\n86-\n87- return paragraphs\n88-\n89-\n90:def extract_markdown(file_path):\n91- \"\"\"Extract text from Markdown, preserving structure.\"\"\"\n92- import frontmatter\n93-\n94- with open(file_path, encoding=\"utf-8\") as f:\n95- post = frontmatter.load(f)\n96-\n97- content = post.content\n98- metadata = dict(post.metadata)\n99-\n100- # Parse headings for structure\n101- sections = []\n102- current_section = {\"heading\": None, \"level\": 0, \"content\": []}\n103-\n104- for line in content.split(\"\\n\"):\n105- heading_match = re.match(r\"^(#{1,6})\\s+(.+)$\", line)\n106- if heading_match:\n107- if current_section[\"content\"] or current_section[\"heading\"]:\n108- sections.append(current_section)\n109- current_section = {\"heading\": heading_match.group(2), \"level\": len(heading_match.group(1)), \"content\": []}\n110- else:\n111- current_section[\"content\"].append(line)\n112-\n113- if current_section[\"content\"] or current_section[\"heading\"]:\n114- sections.append(current_section)\n115-\n116- return {\"metadata\": metadata, \"sections\": sections}\n117-\n118-\n119:def extract_txt(file_path):\n120- \"\"\"Extract text from plain text file.\"\"\"\n121- with open(file_path, encoding=\"utf-8\") as f:\n122- content = f.read()\n123- return {\"text\": content}\n124-\n125-\n126:def extract(file_path):\n127- \"\"\"\n128- Main extraction function.\n129- Returns extracted content with structure info.\n130- \"\"\"\n131- ext = Path(file_path).suffix.lower()\n132-\n133- extractors = {\n134- \".pdf\": extract_pdf,\n135- \".pptx\": extract_pptx,\n136- \".docx\": extract_docx,\n137- \".md\": extract_markdown,\n138- \".txt\": extract_txt,\n139- }\n140-\n141- if ext not in extractors:\n142- raise ValueError(f\"Unsupported file type: {ext}\")\n143-\n144- db.log(\"INFO\", f\"Extracting: {file_path}\", f\"type={ext}\")\n145-\n146- try:\n147- content = extractors[ext](file_path)\n148- return {\"file_path\": file_path, \"file_type\": ext, \"content\": content, \"success\": True}\n149- except Exception as e:\n150- db.log(\"ERROR\", f\"Extraction failed: {file_path}\", str(e))\n151- return {\"file_path\": file_path, \"file_type\": ext, \"error\": str(e), \"success\": False}\n152-\n153-\n154-def get_full_text(extraction_result):\n155- \"\"\"Get plain text from extraction result.\"\"\"\n156- if not extraction_result[\"success\"]:",
"numLines": 141
}
}