Protokoll #17791
| ID | 17791 |
|---|---|
| Timestamp | 2025-12-26 18:42:04.961492 |
| Client | root |
| IP | - |
| Model | claude-sonnet-4-20250514 |
| Status | completed |
| Tokens | 8,777 (Input: 0, Output: 0) |
| Duration | 102 ms |
| Request time | - |
| Response time | - |
Request
{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/analyzers\/entity_extractor.py",
"old_string": "def extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using Ollama.\n\n Supports two response formats:\n 1. New format: {\"persons\":[], \"roles\":[], \"theories\":[], ...}\n 2. Legacy format: {\"entities\": [{\"name\": \"...\", \"type\": \"...\"}]}\n \"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{text}\"\"\"\n\n # Support both {text} and {{TEXT}} placeholders\n prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n\n response_text = data.get(\"response\", \"{}\")\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=data.get(\"prompt_eval_count\", 0),\n tokens_output=data.get(\"eval_count\", 0),\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n try:\n result = json.loads(response_text)\n return _normalize_entity_response(result, text)\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=f\"ollama:{model}\",\n status=\"error\",\n error_message=str(e),\n )\n return []",
"new_string": "def _call_ollama(prompt: str, model: str, timeout: int = 120) -> tuple[str, int, int, int]:\n \"\"\"Call Ollama API and return (response_text, tokens_in, tokens_out, duration_ms).\"\"\"\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=timeout,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n return (\n data.get(\"response\", \"{}\"),\n data.get(\"prompt_eval_count\", 0),\n data.get(\"eval_count\", 0),\n duration_ms,\n )\n\n\ndef extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using 2-pass approach for better categorization.\n\n Pass 1: Extract entity names from text\n Pass 2: Categorize extracted entities\n\n Falls back to single-pass if 2-pass prompts not available.\n \"\"\"\n # Try 2-pass approach first\n pass1_data = db.get_prompt_by_name(\"entity_extraction_pass1\")\n pass2_data = db.get_prompt_by_name(\"entity_extraction_pass2\")\n\n if pass1_data and pass2_data:\n return _extract_entities_2pass(text, pass1_data[\"content\"], pass2_data[\"content\"], model)\n\n # Fallback to single-pass\n return _extract_entities_single_pass(text, model)\n\n\ndef _extract_entities_2pass(\n text: str, pass1_template: str, pass2_template: str, model: str\n) -> list[dict]:\n \"\"\"2-pass entity extraction: extract then categorize.\"\"\"\n try:\n # PASS 1: Extract entity names\n prompt1 = pass1_template.replace(\"{text}\", text[:3000])\n resp1, tok_in1, tok_out1, dur1 = _call_ollama(prompt1, model)\n\n try:\n result1 = json.loads(resp1)\n raw_entities = result1.get(\"entities\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 1 JSON\")\n return []\n\n # Validate: only keep entities that appear in text\n valid_entities = [e for e in raw_entities if _validate_entity_in_text(e, text)]\n\n if not valid_entities:\n return []\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass1] {len(valid_entities)} entities\",\n response=json.dumps(valid_entities[:10], ensure_ascii=False),\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in1,\n tokens_output=tok_out1,\n duration_ms=dur1,\n status=\"completed\",\n )\n\n # PASS 2: Categorize entities\n entities_json = json.dumps(valid_entities, ensure_ascii=False)\n prompt2 = pass2_template.replace(\"{entities}\", entities_json)\n resp2, tok_in2, tok_out2, dur2 = _call_ollama(prompt2, model)\n\n try:\n result2 = json.loads(resp2)\n categorized = result2.get(\"kategorisiert\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 2 JSON\")\n # Fallback: return uncategorized entities\n return [{\"name\": e, \"type\": \"CONCEPT\", \"description\": None} for e in valid_entities]\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass2] categorize {len(valid_entities)} entities\",\n response=resp2[:1000],\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in2,\n tokens_output=tok_out2,\n duration_ms=dur2,\n status=\"completed\",\n )\n\n # Normalize output\n entities = []\n for e in categorized:\n if isinstance(e, dict) and \"name\" in e and \"type\" in e:\n # Final validation\n if _validate_entity_in_text(e[\"name\"], text):\n entities.append({\n \"name\": e[\"name\"],\n \"type\": e[\"type\"].upper(),\n \"description\": e.get(\"description\"),\n })\n\n return entities\n\n except Exception as e:\n db.log(\"ERROR\", 
f\"2-pass entity extraction failed: {e}\")\n return []\n\n\ndef _extract_entities_single_pass(text: str, model: str) -> list[dict]:\n \"\"\"Single-pass entity extraction (legacy fallback).\"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{text}\"\"\"\n\n prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])\n\n try:\n resp, tok_in, tok_out, dur = _call_ollama(prompt, model)\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=resp[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in,\n tokens_output=tok_out,\n duration_ms=dur,\n status=\"completed\",\n )\n\n try:\n result = json.loads(resp)\n return _normalize_entity_response(result, text)\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n return []"
}
}
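For context, the edit captured in this request refactors `extract_entities_ollama` in `entity_extractor.py` from a single Ollama call into a two-pass flow: pass 1 extracts candidate entity names, each candidate is kept only if it occurs verbatim in the source text, and pass 2 categorizes the survivors (with a fallback to the old single-pass path when the two prompts are not in the database). The sketch below condenses that control flow; `call_llm` is a hypothetical stand-in for the `_call_ollama` helper (which posts to Ollama's `/api/generate`), and the inline prompts are illustrative only, since the real templates are loaded from the prompt database and use `{text}` / `{entities}` placeholders.

```python
import json


def validate_entity_in_text(name: str, source_text: str) -> bool:
    """Keep only names of length >= 3 that occur verbatim in the source text."""
    return bool(name) and len(name) >= 3 and name in source_text


def extract_entities_two_pass(text: str, call_llm) -> list[dict]:
    """Condensed two-pass flow: extract names, validate, then categorize.

    `call_llm(prompt) -> str` is a hypothetical stand-in for the Ollama call.
    """
    # Pass 1: ask for bare entity names, expected as {"entities": ["...", ...]}
    pass1_prompt = "Extrahiere Entitäten als JSON:\n" + text[:3000]
    try:
        raw = json.loads(call_llm(pass1_prompt)).get("entities", [])
    except json.JSONDecodeError:
        return []

    # Drop hallucinated names that do not occur verbatim in the text
    valid = [e for e in raw if isinstance(e, str) and validate_entity_in_text(e, text)]
    if not valid:
        return []

    # Pass 2: categorize the validated names, expected as {"kategorisiert": [...]}
    pass2_prompt = "Kategorisiere diese Entitäten als JSON:\n" + json.dumps(valid, ensure_ascii=False)
    try:
        categorized = json.loads(call_llm(pass2_prompt)).get("kategorisiert", [])
    except json.JSONDecodeError:
        # Fallback from the logged edit: return the validated names untyped as CONCEPT
        return [{"name": e, "type": "CONCEPT", "description": None} for e in valid]

    # Normalize and re-validate the categorized output
    return [
        {"name": e["name"], "type": e["type"].upper(), "description": e.get("description")}
        for e in categorized
        if isinstance(e, dict) and "name" in e and "type" in e
        and validate_entity_in_text(e["name"], text)
    ]
```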
Response
{
"tool_response": {
"filePath": "\/var\/www\/scripts\/pipeline\/analyzers\/entity_extractor.py",
"oldString": "def extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using Ollama.\n\n Supports two response formats:\n 1. New format: {\"persons\":[], \"roles\":[], \"theories\":[], ...}\n 2. Legacy format: {\"entities\": [{\"name\": \"...\", \"type\": \"...\"}]}\n \"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{text}\"\"\"\n\n # Support both {text} and {{TEXT}} placeholders\n prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n\n response_text = data.get(\"response\", \"{}\")\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=data.get(\"prompt_eval_count\", 0),\n tokens_output=data.get(\"eval_count\", 0),\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n try:\n result = json.loads(response_text)\n return _normalize_entity_response(result, text)\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=f\"ollama:{model}\",\n status=\"error\",\n error_message=str(e),\n )\n return []",
"newString": "def _call_ollama(prompt: str, model: str, timeout: int = 120) -> tuple[str, int, int, int]:\n \"\"\"Call Ollama API and return (response_text, tokens_in, tokens_out, duration_ms).\"\"\"\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=timeout,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n return (\n data.get(\"response\", \"{}\"),\n data.get(\"prompt_eval_count\", 0),\n data.get(\"eval_count\", 0),\n duration_ms,\n )\n\n\ndef extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using 2-pass approach for better categorization.\n\n Pass 1: Extract entity names from text\n Pass 2: Categorize extracted entities\n\n Falls back to single-pass if 2-pass prompts not available.\n \"\"\"\n # Try 2-pass approach first\n pass1_data = db.get_prompt_by_name(\"entity_extraction_pass1\")\n pass2_data = db.get_prompt_by_name(\"entity_extraction_pass2\")\n\n if pass1_data and pass2_data:\n return _extract_entities_2pass(text, pass1_data[\"content\"], pass2_data[\"content\"], model)\n\n # Fallback to single-pass\n return _extract_entities_single_pass(text, model)\n\n\ndef _extract_entities_2pass(\n text: str, pass1_template: str, pass2_template: str, model: str\n) -> list[dict]:\n \"\"\"2-pass entity extraction: extract then categorize.\"\"\"\n try:\n # PASS 1: Extract entity names\n prompt1 = pass1_template.replace(\"{text}\", text[:3000])\n resp1, tok_in1, tok_out1, dur1 = _call_ollama(prompt1, model)\n\n try:\n result1 = json.loads(resp1)\n raw_entities = result1.get(\"entities\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 1 JSON\")\n return []\n\n # Validate: only keep entities that appear in text\n valid_entities = [e for e in raw_entities if _validate_entity_in_text(e, text)]\n\n if not valid_entities:\n return []\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass1] {len(valid_entities)} entities\",\n response=json.dumps(valid_entities[:10], ensure_ascii=False),\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in1,\n tokens_output=tok_out1,\n duration_ms=dur1,\n status=\"completed\",\n )\n\n # PASS 2: Categorize entities\n entities_json = json.dumps(valid_entities, ensure_ascii=False)\n prompt2 = pass2_template.replace(\"{entities}\", entities_json)\n resp2, tok_in2, tok_out2, dur2 = _call_ollama(prompt2, model)\n\n try:\n result2 = json.loads(resp2)\n categorized = result2.get(\"kategorisiert\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 2 JSON\")\n # Fallback: return uncategorized entities\n return [{\"name\": e, \"type\": \"CONCEPT\", \"description\": None} for e in valid_entities]\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass2] categorize {len(valid_entities)} entities\",\n response=resp2[:1000],\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in2,\n tokens_output=tok_out2,\n duration_ms=dur2,\n status=\"completed\",\n )\n\n # Normalize output\n entities = []\n for e in categorized:\n if isinstance(e, dict) and \"name\" in e and \"type\" in e:\n # Final validation\n if _validate_entity_in_text(e[\"name\"], text):\n entities.append({\n \"name\": e[\"name\"],\n \"type\": e[\"type\"].upper(),\n \"description\": e.get(\"description\"),\n })\n\n return entities\n\n except Exception as e:\n db.log(\"ERROR\", 
f\"2-pass entity extraction failed: {e}\")\n return []\n\n\ndef _extract_entities_single_pass(text: str, model: str) -> list[dict]:\n \"\"\"Single-pass entity extraction (legacy fallback).\"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{text}\"\"\"\n\n prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])\n\n try:\n resp, tok_in, tok_out, dur = _call_ollama(prompt, model)\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=resp[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in,\n tokens_output=tok_out,\n duration_ms=dur,\n status=\"completed\",\n )\n\n try:\n result = json.loads(resp)\n return _normalize_entity_response(result, text)\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n return []",
"originalFile": "\"\"\"\nEntity Extraction - Extract and store entities from text.\n\"\"\"\n\nimport json\nimport re\nimport sys\nimport time\nimport unicodedata\n\nimport requests\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\nfrom config import ANTHROPIC_MODEL, OLLAMA_HOST\nfrom db import db\nfrom protokoll import protokoll\n\n\ndef normalize_name(name: str) -> str:\n \"\"\"Generate canonical_name from entity name.\n\n Rules:\n - Lowercase\n - German umlauts: ä→ae, ö→oe, ü→ue, ß→ss\n - Replace spaces with underscores\n - Remove special characters except underscores\n - Collapse multiple underscores\n \"\"\"\n if not name:\n return \"\"\n\n result = name.lower()\n\n replacements = {\n \"ä\": \"ae\", \"ö\": \"oe\", \"ü\": \"ue\", \"ß\": \"ss\",\n \"Ä\": \"ae\", \"Ö\": \"oe\", \"Ü\": \"ue\",\n }\n for old, new in replacements.items():\n result = result.replace(old, new)\n\n result = unicodedata.normalize(\"NFKD\", result)\n result = result.encode(\"ascii\", \"ignore\").decode(\"ascii\")\n result = re.sub(r\"[\\s\\-]+\", \"_\", result)\n result = re.sub(r\"[^a-z0-9_]\", \"\", result)\n result = re.sub(r\"_+\", \"_\", result)\n result = result.strip(\"_\")\n\n return result\n\n\n# Category to type mapping for new format\nCATEGORY_TYPE_MAP = {\n \"persons\": \"PERSON\",\n \"roles\": \"ROLE\",\n \"organizations\": \"ORGANIZATION\",\n \"theories\": \"THEORY\",\n \"models\": \"MODEL\",\n \"concepts\": \"CONCEPT\",\n \"artifacts\": \"ARTIFACT\",\n \"metaphors\": \"METAPHOR\",\n \"locations\": \"LOCATION\",\n}\n\n\ndef _validate_entity_in_text(entity_name: str, source_text: str) -> bool:\n \"\"\"Strictly validate that entity appears EXACTLY in source text.\"\"\"\n if not entity_name or len(entity_name) < 3:\n return False\n # Exact match required\n return entity_name in source_text\n\n\ndef _normalize_entity_response(result: dict, source_text: str) -> list[dict]:\n \"\"\"Normalize entity response to standard format with validation.\n\n Handles two formats:\n 1. New: {\"persons\":[], \"roles\":[], ...}\n 2. Legacy: {\"entities\": [...]}\n\n Also validates entities against source text to filter hallucinations.\n \"\"\"\n entities = []\n\n # Check for legacy format\n if \"entities\" in result:\n legacy_entities = result.get(\"entities\", [])\n # Validate legacy entities too\n for e in legacy_entities:\n if isinstance(e, dict) and \"name\" in e:\n if _validate_entity_in_text(e[\"name\"], source_text):\n entities.append(e)\n return entities\n\n # New categorized format\n for category, items in result.items():\n if not isinstance(items, list):\n continue\n\n entity_type = CATEGORY_TYPE_MAP.get(category, category.upper())\n\n for item in items:\n if not item or not isinstance(item, str):\n continue\n\n # Strict validation: entity must appear EXACTLY in source text\n if not _validate_entity_in_text(item, source_text):\n continue # Skip hallucinations\n\n entities.append({\n \"name\": item,\n \"type\": entity_type,\n \"description\": None,\n })\n\n return entities\n\n\ndef extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using Ollama.\n\n Supports two response formats:\n 1. New format: {\"persons\":[], \"roles\":[], \"theories\":[], ...}\n 2. 
Legacy format: {\"entities\": [{\"name\": \"...\", \"type\": \"...\"}]}\n \"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{text}\"\"\"\n\n # Support both {text} and {{TEXT}} placeholders\n prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n\n response_text = data.get(\"response\", \"{}\")\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=data.get(\"prompt_eval_count\", 0),\n tokens_output=data.get(\"eval_count\", 0),\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n try:\n result = json.loads(response_text)\n return _normalize_entity_response(result, text)\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=f\"ollama:{model}\",\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef extract_entities_anthropic(text: str, client) -> list[dict]:\n \"\"\"Extract entities using Anthropic Claude.\"\"\"\n prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")\n prompt_template = prompt_data[\"content\"] if prompt_data else None\n\n if not prompt_template:\n prompt_template = \"\"\"Analysiere den folgenden deutschen Text und extrahiere alle wichtigen Entitäten.\n\nKategorisiere jede Entität als:\n- PERSON (Namen von Personen)\n- ORGANIZATION (Firmen, Institutionen, Gruppen)\n- CONCEPT (Fachbegriffe, Methoden, Theorien)\n- LOCATION (Orte, Länder)\n- DATE (Zeitangaben)\n- OTHER (Sonstiges)\n\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"context\": \"kurzer Kontext der Erwähnung\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{TEXT}}\", text[:4000])\n\n try:\n start_time = time.time()\n message = client.messages.create(\n model=ANTHROPIC_MODEL, max_tokens=2000, messages=[{\"role\": \"user\", \"content\": prompt}]\n )\n duration_ms = int((time.time() - start_time) * 1000)\n\n response_text = message.content[0].text\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=ANTHROPIC_MODEL,\n tokens_input=message.usage.input_tokens,\n tokens_output=message.usage.output_tokens,\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n if json_match:\n entities = json.loads(json_match.group())\n return entities.get(\"entities\", [])\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Anthropic entity extraction 
failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=ANTHROPIC_MODEL,\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef store_entities(document_id: int, entities: list[dict]) -> int:\n \"\"\"Store extracted entities in database with deduplication via canonical_name.\"\"\"\n stored = 0\n\n for entity in entities:\n try:\n description = entity.get(\"description\") or entity.get(\"context\") or None\n canonical = normalize_name(entity[\"name\"])\n\n # Check for existing entity by canonical_name first (deduplication)\n cursor = db.execute(\n \"SELECT id, description FROM entities WHERE canonical_name = %s AND type = %s\",\n (canonical, entity[\"type\"]),\n )\n existing = cursor.fetchone()\n cursor.close()\n\n # Fallback: check by exact name\n if not existing:\n cursor = db.execute(\n \"SELECT id, description FROM entities WHERE name = %s AND type = %s\",\n (entity[\"name\"], entity[\"type\"]),\n )\n existing = cursor.fetchone()\n cursor.close()\n\n if existing:\n entity_id = existing[\"id\"]\n # Update description and canonical_name if missing\n if description and not existing[\"description\"]:\n cursor = db.execute(\n \"UPDATE entities SET description = %s, canonical_name = %s WHERE id = %s\",\n (description, canonical, entity_id),\n )\n db.commit()\n cursor.close()\n else:\n # Ensure canonical_name is set\n cursor = db.execute(\n \"UPDATE entities SET canonical_name = %s WHERE id = %s AND canonical_name IS NULL\",\n (canonical, entity_id),\n )\n db.commit()\n cursor.close()\n else:\n # Insert new entity with canonical_name and status='normalized'\n cursor = db.execute(\n \"\"\"INSERT INTO entities (name, type, description, canonical_name, status, created_at)\n VALUES (%s, %s, %s, %s, 'normalized', NOW())\"\"\",\n (entity[\"name\"], entity[\"type\"], description, canonical),\n )\n db.commit()\n entity_id = cursor.lastrowid\n cursor.close()\n\n db.log_provenance(\n artifact_type=\"entity\",\n artifact_id=ent... [TRUNCATED-b92205cd77cf5a01]",
"structuredPatch": [
{
"oldStart": 116,
"oldLines": 13,
"newStart": 116,
"newLines": 120,
"lines": [
" return entities",
" ",
" ",
"+def _call_ollama(prompt: str, model: str, timeout: int = 120) -> tuple[str, int, int, int]:",
"+ \"\"\"Call Ollama API and return (response_text, tokens_in, tokens_out, duration_ms).\"\"\"",
"+ start_time = time.time()",
"+ response = requests.post(",
"+ f\"{OLLAMA_HOST}\/api\/generate\",",
"+ json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},",
"+ timeout=timeout,",
"+ )",
"+ response.raise_for_status()",
"+ data = response.json()",
"+ duration_ms = int((time.time() - start_time) * 1000)",
"+ return (",
"+ data.get(\"response\", \"{}\"),",
"+ data.get(\"prompt_eval_count\", 0),",
"+ data.get(\"eval_count\", 0),",
"+ duration_ms,",
"+ )",
"+",
"+",
" def extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:",
"- \"\"\"Extract entities using Ollama.",
"+ \"\"\"Extract entities using 2-pass approach for better categorization.",
" ",
"- Supports two response formats:",
"- 1. New format: {\"persons\":[], \"roles\":[], \"theories\":[], ...}",
"- 2. Legacy format: {\"entities\": [{\"name\": \"...\", \"type\": \"...\"}]}",
"+ Pass 1: Extract entity names from text",
"+ Pass 2: Categorize extracted entities",
"+",
"+ Falls back to single-pass if 2-pass prompts not available.",
" \"\"\"",
"+ # Try 2-pass approach first",
"+ pass1_data = db.get_prompt_by_name(\"entity_extraction_pass1\")",
"+ pass2_data = db.get_prompt_by_name(\"entity_extraction_pass2\")",
"+",
"+ if pass1_data and pass2_data:",
"+ return _extract_entities_2pass(text, pass1_data[\"content\"], pass2_data[\"content\"], model)",
"+",
"+ # Fallback to single-pass",
"+ return _extract_entities_single_pass(text, model)",
"+",
"+",
"+def _extract_entities_2pass(",
"+ text: str, pass1_template: str, pass2_template: str, model: str",
"+) -> list[dict]:",
"+ \"\"\"2-pass entity extraction: extract then categorize.\"\"\"",
"+ try:",
"+ # PASS 1: Extract entity names",
"+ prompt1 = pass1_template.replace(\"{text}\", text[:3000])",
"+ resp1, tok_in1, tok_out1, dur1 = _call_ollama(prompt1, model)",
"+",
"+ try:",
"+ result1 = json.loads(resp1)",
"+ raw_entities = result1.get(\"entities\", [])",
"+ except json.JSONDecodeError:",
"+ db.log(\"WARNING\", \"Failed to parse Pass 1 JSON\")",
"+ return []",
"+",
"+ # Validate: only keep entities that appear in text",
"+ valid_entities = [e for e in raw_entities if _validate_entity_in_text(e, text)]",
"+",
"+ if not valid_entities:",
"+ return []",
"+",
"+ protokoll.log_llm_call(",
"+ request=f\"[entity_extraction_pass1] {len(valid_entities)} entities\",",
"+ response=json.dumps(valid_entities[:10], ensure_ascii=False),",
"+ model_name=f\"ollama:{model}\",",
"+ tokens_input=tok_in1,",
"+ tokens_output=tok_out1,",
"+ duration_ms=dur1,",
"+ status=\"completed\",",
"+ )",
"+",
"+ # PASS 2: Categorize entities",
"+ entities_json = json.dumps(valid_entities, ensure_ascii=False)",
"+ prompt2 = pass2_template.replace(\"{entities}\", entities_json)",
"+ resp2, tok_in2, tok_out2, dur2 = _call_ollama(prompt2, model)",
"+",
"+ try:",
"+ result2 = json.loads(resp2)",
"+ categorized = result2.get(\"kategorisiert\", [])",
"+ except json.JSONDecodeError:",
"+ db.log(\"WARNING\", \"Failed to parse Pass 2 JSON\")",
"+ # Fallback: return uncategorized entities",
"+ return [{\"name\": e, \"type\": \"CONCEPT\", \"description\": None} for e in valid_entities]",
"+",
"+ protokoll.log_llm_call(",
"+ request=f\"[entity_extraction_pass2] categorize {len(valid_entities)} entities\",",
"+ response=resp2[:1000],",
"+ model_name=f\"ollama:{model}\",",
"+ tokens_input=tok_in2,",
"+ tokens_output=tok_out2,",
"+ duration_ms=dur2,",
"+ status=\"completed\",",
"+ )",
"+",
"+ # Normalize output",
"+ entities = []",
"+ for e in categorized:",
"+ if isinstance(e, dict) and \"name\" in e and \"type\" in e:",
"+ # Final validation",
"+ if _validate_entity_in_text(e[\"name\"], text):",
"+ entities.append({",
"+ \"name\": e[\"name\"],",
"+ \"type\": e[\"type\"].upper(),",
"+ \"description\": e.get(\"description\"),",
"+ })",
"+",
"+ return entities",
"+",
"+ except Exception as e:",
"+ db.log(\"ERROR\", f\"2-pass entity extraction failed: {e}\")",
"+ return []",
"+",
"+",
"+def _extract_entities_single_pass(text: str, model: str) -> list[dict]:",
"+ \"\"\"Single-pass entity extraction (legacy fallback).\"\"\"",
" prompt_data = db.get_prompt_by_use_case(\"entity_extraction\")",
" prompt_template = prompt_data[\"content\"] if prompt_data else None",
" "
]
},
{
"oldStart": 136,
"oldLines": 46,
"newStart": 243,
"newLines": 29,
"lines": [
" Text:",
" {text}\"\"\"",
" ",
"- # Support both {text} and {{TEXT}} placeholders",
" prompt = prompt_template.replace(\"{text}\", text[:3000]).replace(\"{{TEXT}}\", text[:3000])",
" ",
" try:",
"- start_time = time.time()",
"- response = requests.post(",
"- f\"{OLLAMA_HOST}\/api\/generate\",",
"- json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},",
"- timeout=120,",
"- )",
"- response.raise_for_status()",
"- data = response.json()",
"- duration_ms = int((time.time() - start_time) * 1000)",
"+ resp, tok_in, tok_out, dur = _call_ollama(prompt, model)",
" ",
"- response_text = data.get(\"response\", \"{}\")",
"-",
" protokoll.log_llm_call(",
" request=f\"[entity_extraction] {prompt[:500]}...\",",
"- response=response_text[:2000],",
"+ response=resp[:2000],",
" model_name=f\"ollama:{model}\",",
"- tokens_input=data.get(\"prompt_eval_count\", 0),",
"- tokens_output=data.get(\"eval_count\", 0),",
"- duration_ms=duration_ms,",
"+ tokens_input=tok_in,",
"+ tokens_output=tok_out,",
"+ duration_ms=dur,",
" status=\"completed\",",
" )",
" ",
" try:",
"- result = json.loads(response_text)",
"+ result = json.loads(resp)",
" return _normalize_entity_response(result, text)",
" except json.JSONDecodeError:",
" db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")",
" return []",
" except Exception as e:",
" db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")",
"- protokoll.log_llm_call(",
"- request=f\"[entity_extraction] {prompt[:500]}...\",",
"- model_name=f\"ollama:{model}\",",
"- status=\"error\",",
"- error_message=str(e),",
"- )",
" return []",
" ",
" "
]
}
],
"userModified": false,
"replaceAll": false
}
}
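A closing note on the logged file: hallucination filtering in `_validate_entity_in_text` requires an exact, case-sensitive substring match of at least three characters, while deduplication in `store_entities` keys on the `canonical_name` produced by `normalize_name` (lowercasing, umlaut transliteration, underscore normalization). The following is a condensed sketch of that pairing, based on the `originalFile` excerpt above; the sample inputs are illustrative only.

```python
import re
import unicodedata


def validate_entity_in_text(name: str, source_text: str) -> bool:
    # Exact, case-sensitive substring match; very short strings are rejected.
    return bool(name) and len(name) >= 3 and name in source_text


def normalize_name(name: str) -> str:
    # Canonical key used for deduplication: lowercase, transliterate umlauts,
    # spaces/hyphens -> "_", strip everything outside [a-z0-9_], collapse "_".
    result = name.lower()
    for old, new in {"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"}.items():
        result = result.replace(old, new)
    result = unicodedata.normalize("NFKD", result).encode("ascii", "ignore").decode("ascii")
    result = re.sub(r"[\s\-]+", "_", result)
    result = re.sub(r"[^a-z0-9_]", "", result)
    return re.sub(r"_+", "_", result).strip("_")


text = "Niklas Luhmann prägte die Systemtheorie."
print(validate_entity_in_text("Systemtheorie", text))   # True: occurs verbatim
print(validate_entity_in_text("systemtheorie", text))   # False: match is case-sensitive
print(normalize_name("Systemtheorie"))                   # "systemtheorie"
print(normalize_name("Niklas Luhmann"))                  # "niklas_luhmann"
```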