{
"tool_response": {
"type": "text",
"file": {
"filePath": "\/var\/www\/scripts\/pipeline\/analyzers\/entity_extractor.py",
"content": "\ndef _call_ollama(prompt: str, model: str, timeout: int = 120) -> tuple[str, int, int, int]:\n \"\"\"Call Ollama API and return (response_text, tokens_in, tokens_out, duration_ms).\"\"\"\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=timeout,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n return (\n data.get(\"response\", \"{}\"),\n data.get(\"prompt_eval_count\", 0),\n data.get(\"eval_count\", 0),\n duration_ms,\n )\n\n\ndef extract_entities_ollama(text: str, model: str = \"gemma3:27b-it-qat\") -> list[dict]:\n \"\"\"Extract entities using 2-pass approach for better categorization.\n\n Pass 1: Extract entity names from text\n Pass 2: Categorize extracted entities\n Post: Normalize types using deterministic rules\n\n Falls back to single-pass if 2-pass prompts not available.\n \"\"\"\n # Try 2-pass approach first\n pass1_template = db.get_prompt(\"entity_extraction_pass1\")\n pass2_template = db.get_prompt(\"entity_extraction_pass2\")\n\n if pass1_template and pass2_template:\n entities = _extract_entities_2pass(text, pass1_template, pass2_template, model)\n else:\n # Fallback to single-pass\n entities = _extract_entities_single_pass(text, model)\n\n return entities\n\n\ndef _extract_entities_2pass(\n text: str, pass1_template: str, pass2_template: str, model: str\n) -> list[dict]:\n \"\"\"2-pass entity extraction: extract then categorize.\"\"\"\n try:\n # PASS 1: Extract entity names\n prompt1 = pass1_template.replace(\"{text}\", text[:3000])\n resp1, tok_in1, tok_out1, dur1 = _call_ollama(prompt1, model)\n\n try:\n result1 = json.loads(resp1)\n raw_entities = result1.get(\"entities\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 1 JSON\")\n return []\n\n # Validate: only keep entities that appear in text\n valid_entities = [e for e in raw_entities if _validate_entity_in_text(e, text)]\n\n if not valid_entities:\n return []\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass1] {len(valid_entities)} entities\",\n response=json.dumps(valid_entities[:10], ensure_ascii=False),\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in1,\n tokens_output=tok_out1,\n duration_ms=dur1,\n status=\"completed\",\n )\n\n # PASS 2: Categorize entities\n entities_json = json.dumps(valid_entities, ensure_ascii=False)\n prompt2 = pass2_template.replace(\"{entities}\", entities_json)\n resp2, tok_in2, tok_out2, dur2 = _call_ollama(prompt2, model)\n\n try:\n result2 = json.loads(resp2)\n categorized = result2.get(\"kategorisiert\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse Pass 2 JSON\")\n # Fallback: return uncategorized entities\n return [{\"name\": e, \"type\": \"CONCEPT\", \"description\": None} for e in valid_entities]\n\n protokoll.log_llm_call(\n request=f\"[entity_extraction_pass2] categorize {len(valid_entities)} entities\",\n response=resp2[:1000],\n model_name=f\"ollama:{model}\",\n tokens_input=tok_in2,\n tokens_output=tok_out2,\n duration_ms=dur2,\n status=\"completed\",\n )\n\n # Normalize output\n entities = []\n for e in categorized:",
"numLines": 100,
"startLine": 130,
"totalLines": 461
}
}
}