Protocol #4040
| ID | 4040 |
|---|---|
| Timestamp | 2025-12-22 09:04:29.579759 |
| Client | root |
| IP | - |
| Model | claude-sonnet-4-20250514 |
| Status | pending |
| Tokens | 2,797 (Input: 0, Output: 0) |
| Duration | - |
| Request time | - |
| Response time | - |
Request
{
"event": "PostToolUse",
"raw_data": {
"session_id": "bf8a0fc3-34fa-408b-8067-b725d1fe9b14",
"transcript_path": "\/root\/.claude\/projects\/-var-www\/bf8a0fc3-34fa-408b-8067-b725d1fe9b14.jsonl",
"cwd": "\/var\/www\/dev.campus.systemische-tools.de",
"permission_mode": "acceptEdits",
"hook_event_name": "PostToolUse",
"tool_name": "Read",
"tool_input": {
"file_path": "\/opt\/scripts\/pipeline\/chunk.py"
},
"tool_response": {
"type": "text",
"file": {
"filePath": "\/opt\/scripts\/pipeline\/chunk.py",
"content": "\"\"\"\nSemantic chunking for KI-System Pipeline\nSplits documents into meaningful chunks preserving hierarchy.\n\"\"\"\n\nimport re\n\nfrom config import CHUNK_OVERLAP_PERCENT, MAX_CHUNK_SIZE, MIN_CHUNK_SIZE\n\n\ndef split_into_sentences(text):\n \"\"\"Split text into sentences.\"\"\"\n # German-aware sentence splitting\n pattern = r\"(?<=[.!?])\\s+(?=[A-ZÄÖÜ])\"\n sentences = re.split(pattern, text)\n return [s.strip() for s in sentences if s.strip()]\n\n\ndef calculate_overlap(chunk_size):\n \"\"\"Calculate overlap size based on chunk size.\"\"\"\n return int(chunk_size * CHUNK_OVERLAP_PERCENT \/ 100)\n\n\ndef chunk_by_structure(extraction_result):\n \"\"\"\n Chunk document based on its structure.\n Preserves heading hierarchy in metadata.\n \"\"\"\n chunks = []\n file_type = extraction_result[\"file_type\"]\n content = extraction_result[\"content\"]\n\n if file_type == \".pdf\":\n chunks = chunk_pdf(content)\n elif file_type == \".pptx\":\n chunks = chunk_pptx(content)\n elif file_type == \".docx\":\n chunks = chunk_docx(content)\n elif file_type == \".md\":\n chunks = chunk_markdown(content)\n elif file_type == \".txt\":\n chunks = chunk_text(content[\"text\"])\n\n return chunks\n\n\ndef chunk_pdf(pages):\n \"\"\"Chunk PDF by pages and paragraphs.\"\"\"\n chunks = []\n position = 0\n\n for page in pages:\n if not page[\"text\"]:\n continue\n\n # Split page into paragraphs\n paragraphs = page[\"text\"].split(\"\\n\\n\")\n\n current_chunk = []\n current_size = 0\n\n for para in paragraphs:\n para = para.strip()\n if not para:\n continue\n\n para_size = len(para)\n\n # If paragraph alone exceeds max, split it\n if para_size > MAX_CHUNK_SIZE:\n # Flush current chunk\n if current_chunk:\n chunk_text = \"\\n\\n\".join(current_chunk)\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": [f\"Seite {page['page']}\"],\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {\"page\": page[\"page\"]},\n }\n )\n position += len(chunk_text)\n current_chunk = []\n current_size = 0\n\n # Split large paragraph by sentences\n sentences = split_into_sentences(para)\n sentence_chunk = []\n sentence_size = 0\n\n for sentence in sentences:\n if sentence_size + len(sentence) > MAX_CHUNK_SIZE:\n chunk_text = \" \".join(sentence_chunk)\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": [f\"Seite {page['page']}\"],\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {\"page\": page[\"page\"]},\n }\n )\n position += len(chunk_text)\n # Keep overlap\n overlap_count = max(1, len(sentence_chunk) \/\/ 10)\n sentence_chunk = sentence_chunk[-overlap_count:]\n sentence_size = sum(len(s) for s in sentence_chunk)\n\n sentence_chunk.append(sentence)\n sentence_size += len(sentence)\n\n if sentence_chunk:\n current_chunk = [\" \".join(sentence_chunk)]\n current_size = sentence_size\n\n elif current_size + para_size > MAX_CHUNK_SIZE:\n # Flush current chunk\n chunk_text = \"\\n\\n\".join(current_chunk)\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": [f\"Seite {page['page']}\"],\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {\"page\": page[\"page\"]},\n }\n )\n position += len(chunk_text)\n\n # Start new chunk with overlap\n overlap = calculate_overlap(len(chunk_text))\n if overlap > 0 and current_chunk:\n overlap_text = current_chunk[-1][-overlap:]\n current_chunk = [overlap_text, para]\n current_size = len(overlap_text) + 
para_size\n else:\n current_chunk = [para]\n current_size = para_size\n else:\n current_chunk.append(para)\n current_size += para_size\n\n # Flush remaining\n if current_chunk:\n chunk_text = \"\\n\\n\".join(current_chunk)\n if len(chunk_text) >= MIN_CHUNK_SIZE:\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": [f\"Seite {page['page']}\"],\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {\"page\": page[\"page\"]},\n }\n )\n position += len(chunk_text)\n\n return chunks\n\n\ndef chunk_pptx(slides):\n \"\"\"Chunk PowerPoint by slides.\"\"\"\n chunks = []\n position = 0\n\n for slide in slides:\n content_parts = []\n if slide[\"text\"]:\n content_parts.append(slide[\"text\"])\n if slide[\"notes\"]:\n content_parts.append(f\"\\n[Notizen: {slide['notes']}]\")\n\n if content_parts:\n chunk_text = \"\\n\".join(content_parts)\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": [f\"Folie {slide['slide']}\"],\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {\"slide\": slide[\"slide\"]},\n }\n )\n position += len(chunk_text)\n\n return chunks\n\n\ndef chunk_docx(paragraphs):\n \"\"\"Chunk Word document by headings.\"\"\"\n chunks = []\n position = 0\n current_headings = []\n current_chunk = []\n current_size = 0\n\n for para in paragraphs:\n if para[\"is_heading\"]:\n # Flush current chunk\n if current_chunk:\n chunk_text = \"\\n\\n\".join(current_chunk)\n if len(chunk_text) >= MIN_CHUNK_SIZE:\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": current_headings.copy(),\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {},\n }\n )\n position += len(chunk_text)\n current_chunk = []\n current_size = 0\n\n # Update heading path\n level = int(para[\"style\"].replace(\"Heading \", \"\")) if \"Heading \" in para[\"style\"] else 1\n while len(current_headings) >= level:\n current_headings.pop()\n current_headings.append(para[\"text\"])\n\n else:\n para_size = len(para[\"text\"])\n if current_size + para_size > MAX_CHUNK_SIZE and current_chunk:\n # Flush\n chunk_text = \"\\n\\n\".join(current_chunk)\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": current_headings.copy(),\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {},\n }\n )\n position += len(chunk_text)\n current_chunk = []\n current_size = 0\n\n current_chunk.append(para[\"text\"])\n current_size += para_size\n\n # Flush remaining\n if current_chunk:\n chunk_text = \"\\n\\n\".join(current_chunk)\n if len(chunk_text) >= MIN_CHUNK_SIZE:\n chunks.append(\n {\n \"content\": chunk_text,\n \"heading_path\": current_headings.copy(),\n \"position_start\": position,\n \"position_end\": position + len(chunk_text),\n \"metadata\": {},\n }\n )\n\n return chunks\n\n\ndef chunk_markdown(content):\n \"\"\"Chunk Markdown by sections.\"\"\"\n chunks = []\n position = 0\n heading_stack = []\n\n for section in content[\"sections\"]:\n if section[\"heading\"]:\n level = section[\"level\"]\n while len(heading_stack) >= level:\n heading_stack.pop()\n heading_stack.append(section[\"heading\"])\n\n section_text = \"\\n\".join(section[\"content\"]).strip()\n if section_text and len(section_text) >= MIN_CHUNK_SIZE:\n # Large sections need splitting\n if len(section_text) > MAX_CHUNK_SIZE:\n sub_chunks = chunk_text(section_text)\n for sub in sub_chunks:\n sub[\"heading_path\"] = heading_stack.copy()\n 
sub[\"position_start\"] = position\n sub[\"position_end\"] = position + len(sub[\"content\"])\n position += len(sub[\"content\"])\n chunks.append(sub)\n else:\n chunks.append(\n {\n \"content\": section_text,\n \"heading_path\": heading_stack.co... [TRUNCATED-490213e87c9dc8f8]",
"numLines": 408,
"startLine": 1,
"totalLines": 408
}
},
"tool_use_id": "toolu_01XauPh9gPFoe9N23rFTkK7N"
}
}
Response
-
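Notes

The file read in this request, /opt/scripts/pipeline/chunk.py, builds all of its chunking on two small helpers: a German-aware sentence splitter and a percentage-based overlap calculation. The sketch below lifts both functions verbatim from the logged file content; the concrete values for CHUNK_OVERLAP_PERCENT, MAX_CHUNK_SIZE, and MIN_CHUNK_SIZE are assumptions, since config.py is not part of this log entry.

```python
import re

# Assumed values; the logged module imports these from config.py,
# which is not shown in this protocol entry.
CHUNK_OVERLAP_PERCENT = 10
MAX_CHUNK_SIZE = 1000
MIN_CHUNK_SIZE = 100


def split_into_sentences(text):
    """Split text into sentences; the lookahead accepts German capitals (ÄÖÜ)."""
    pattern = r"(?<=[.!?])\s+(?=[A-ZÄÖÜ])"
    sentences = re.split(pattern, text)
    return [s.strip() for s in sentences if s.strip()]


def calculate_overlap(chunk_size):
    """Overlap size as a fixed percentage of the chunk size."""
    return int(chunk_size * CHUNK_OVERLAP_PERCENT / 100)


if __name__ == "__main__":
    sample = "Das ist ein Satz. Ägypten liegt in Afrika! Und weiter geht es."
    print(split_into_sentences(sample))
    # ['Das ist ein Satz.', 'Ägypten liegt in Afrika!', 'Und weiter geht es.']
    print(calculate_overlap(800))  # -> 80 (characters of overlap at 10 %)
```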
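When a single paragraph exceeds MAX_CHUNK_SIZE, chunk_pdf falls back to packing sentences greedily and carries roughly the last 10 % of the window's sentences into the next chunk as overlap (overlap_count = max(1, len(sentence_chunk) // 10) in the logged file). A self-contained sketch of that branch follows; the helper name is illustrative, and it adds a guard against flushing an empty window that the logged code does not have.

```python
def pack_sentences(sentences, max_size):
    """Greedily pack sentences up to max_size, carrying ~10 % of the
    previous window's sentences into the next chunk as overlap."""
    chunks = []
    window, size = [], 0
    for sentence in sentences:
        if size + len(sentence) > max_size and window:  # empty-window guard added
            chunks.append(" ".join(window))
            carry = max(1, len(window) // 10)  # keep last ~10 % of sentences
            window = window[-carry:]
            size = sum(len(s) for s in window)
        window.append(sentence)
        size += len(sentence)
    if window:
        chunks.append(" ".join(window))
    return chunks
```

Because the carry is at least one sentence, consecutive chunks always share their boundary sentence, which is what gives the retrieval pipeline its context overlap.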
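Both chunk_docx and chunk_markdown in the logged file maintain the heading_path metadata with the same stack discipline: when a heading of level n arrives, everything at depth n or deeper is popped before the new heading is pushed. A minimal sketch of that invariant, using a hypothetical helper name not present in the original:

```python
def update_heading_path(stack, heading, level):
    """Pop entries at or below the incoming level, then push the new heading.

    Mirrors the stack handling in chunk_docx/chunk_markdown from the logged
    chunk.py; this helper name is hypothetical."""
    while len(stack) >= level:
        stack.pop()
    stack.append(heading)
    return stack


path = []
update_heading_path(path, "Einleitung", 1)   # ['Einleitung']
update_heading_path(path, "Methoden", 2)     # ['Einleitung', 'Methoden']
update_heading_path(path, "Chunking", 3)     # ['Einleitung', 'Methoden', 'Chunking']
update_heading_path(path, "Ergebnisse", 2)   # ['Einleitung', 'Ergebnisse']
```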