{
"tool_response": {
"filePath": "\/var\/www\/scripts\/pipeline\/analyze.py",
"oldString": "if __name__ == \"__main__\":\n test_text = \"\"\"\n Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n Sie wird heute weltweit in der Psychotherapie eingesetzt.\n \"\"\"\n\n print(\"Testing entity extraction with Ollama...\")\n entities = extract_entities_ollama(test_text)\n print(f\"Found {len(entities)} entities:\")\n for e in entities:\n print(f\" - {e['name']} ({e['type']})\")",
"newString": "def extract_statements(chunk_id: int, text: str, client=None) -> list[dict]:\n \"\"\"\n Extract SPO-triplets (Subject-Predicate-Object statements) from text.\n\n Args:\n chunk_id: ID of the chunk being analyzed\n text: Text content to extract statements from\n client: Optional Anthropic client (falls back to Ollama if None)\n\n Returns:\n List of extracted statements with entity linking\n \"\"\"\n prompt_template = db.get_prompt(\"statement_extraction\")\n\n if not prompt_template:\n db.log(\"WARNING\", \"statement_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Extrahiere alle faktischen Aussagen aus dem Text als SPO-Tripel.\n\nRegeln:\n- Subject: Eine benannte Entität (Person, Organisation, Konzept, Methode)\n- Predicate: Die Beziehung oder Eigenschaft (z.B. \"entwickelte\", \"basiert auf\", \"ist Teil von\")\n- Object: Eine Entität oder ein Literal-Wert\n\nAntworte NUR im JSON-Format:\n{\"statements\": [\n {\"subject\": \"Name der Subject-Entität\", \"predicate\": \"Beziehung\", \"object\": \"Name oder Wert\", \"confidence\": 0.0-1.0}\n]}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n tokens_in, tokens_out = 0, 0\n model_name = \"\"\n\n if client:\n message = client.messages.create(\n model=ANTHROPIC_MODEL,\n max_tokens=1500,\n messages=[{\"role\": \"user\", \"content\": prompt}],\n )\n response_text = message.content[0].text\n tokens_in = message.usage.input_tokens\n tokens_out = message.usage.output_tokens\n model_name = ANTHROPIC_MODEL\n else:\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\n \"model\": OLLAMA_CHAT_MODEL,\n \"prompt\": prompt,\n \"stream\": False,\n \"format\": \"json\",\n },\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n response_text = data.get(\"response\", \"{}\")\n tokens_in = data.get(\"prompt_eval_count\", 0)\n tokens_out = data.get(\"eval_count\", 0)\n model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n duration_ms = int((time.time() - start_time) * 1000)\n\n # Log to ki-protokoll\n protokoll.log_llm_call(\n request=f\"[statement_extraction] chunk_id={chunk_id}\",\n response=response_text[:2000],\n model_name=model_name,\n tokens_input=tokens_in,\n tokens_output=tokens_out,\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n # Parse JSON\n import re\n\n json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n if json_match:\n result = json.loads(json_match.group())\n return result.get(\"statements\", [])\n return []\n\n except Exception as e:\n db.log(\"ERROR\", f\"Statement extraction failed for chunk {chunk_id}: {e}\")\n protokoll.log_llm_call(\n request=f\"[statement_extraction] chunk_id={chunk_id}\",\n model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef find_entity_by_name(name: str) -> dict | None:\n \"\"\"\n Find entity by name or canonical_name.\n\n Args:\n name: Entity name to search for\n\n Returns:\n Entity dict with id, name, canonical_name or None\n \"\"\"\n name_lower = name.lower().strip()\n canonical = name_lower.replace(\" \", \"\").replace(\"-\", \"\")\n\n # Try exact canonical match first\n cursor = db.execute(\n \"SELECT id, name, canonical_name FROM entities WHERE canonical_name = %s LIMIT 1\",\n (canonical,),\n )\n result = cursor.fetchone()\n cursor.close()\n if result:\n return result\n\n # Try name match (case-insensitive)\n cursor = db.execute(\n \"SELECT id, name, canonical_name FROM entities WHERE LOWER(name) = %s LIMIT 1\",\n (name_lower,),\n )\n result = cursor.fetchone()\n cursor.close()\n if result:\n return result\n\n # Try partial canonical match\n cursor = db.execute(\n \"SELECT id, name, canonical_name FROM entities WHERE canonical_name LIKE %s LIMIT 1\",\n (f\"%{canonical}%\",),\n )\n result = cursor.fetchone()\n cursor.close()\n return result\n\n\ndef store_statements(chunk_id: int, statements: list[dict]) -> int:\n \"\"\"\n Store extracted statements in the database with entity linking.\n\n Args:\n chunk_id: ID of the source chunk\n statements: List of statement dicts with subject, predicate, object, confidence\n\n Returns:\n Number of successfully stored statements\n \"\"\"\n stored = 0\n\n for stmt in statements:\n try:\n subject_name = stmt.get(\"subject\", \"\").strip()\n predicate = stmt.get(\"predicate\", \"\").strip()\n object_value = stmt.get(\"object\", \"\").strip()\n confidence = float(stmt.get(\"confidence\", 0.8))\n\n if not subject_name or not predicate:\n continue\n\n # Find subject entity\n subject_entity = find_entity_by_name(subject_name)\n if not subject_entity:\n db.log(\"DEBUG\", f\"Subject entity not found: {subject_name}\")\n continue # Subject must be a known entity\n\n subject_entity_id = subject_entity[\"id\"]\n\n # Try to find object entity, otherwise use as literal\n object_entity_id = None\n object_literal = None\n\n if object_value:\n object_entity = find_entity_by_name(object_value)\n if object_entity:\n object_entity_id = object_entity[\"id\"]\n else:\n object_literal = object_value\n\n # Insert statement\n cursor = db.execute(\n \"\"\"INSERT INTO statements\n (subject_entity_id, predicate, object_entity_id, object_literal,\n chunk_id, confidence, status, created_at)\n VALUES (%s, %s, %s, %s, %s, %s, 'extracted', NOW())\"\"\",\n (\n subject_entity_id,\n predicate[:255], # Truncate to VARCHAR(255)\n object_entity_id,\n object_literal,\n chunk_id,\n confidence,\n ),\n )\n db.commit()\n cursor.close()\n stored += 1\n\n except Exception as e:\n db.log(\"WARNING\", f\"Failed to store statement: {e}\")\n\n return stored\n\n\ndef analyze_chunk_statements(chunk_id: int, content: str, client=None) -> int:\n \"\"\"\n Extract and store statements for a single chunk.\n\n Args:\n chunk_id: ID of the chunk\n content: Text content of the chunk\n client: Optional Anthropic client\n\n Returns:\n Number of statements stored\n \"\"\"\n statements = extract_statements(chunk_id, content, client)\n if statements:\n stored = store_statements(chunk_id, statements)\n db.log(\"INFO\", f\"Chunk {chunk_id}: {stored}\/{len(statements)} statements stored\")\n return stored\n return 0\n\n\ndef analyze_document_statements(document_id: int, client=None, progress=None) -> int:\n \"\"\"\n Extract statements from all chunks of a document.\n\n Args:\n document_id: ID of the document\n client: Optional Anthropic client\n progress: Optional PipelineProgress for logging\n\n Returns:\n Total number of statements stored\n \"\"\"\n cursor = db.execute(\n \"SELECT id, content FROM chunks WHERE document_id = %s ORDER BY chunk_index\",\n (document_id,),\n )\n chunks = cursor.fetchall()\n cursor.close()\n\n total = len(chunks)\n db.log(\"INFO\", f\"Extracting statements from {total} chunks\")\n if progress:\n progress.add_log(f\"Statements: Extrahiere aus {total} Chunks...\")\n\n total_stored = 0\n for i, chunk in enumerate(chunks, 1):\n if progress:\n progress.add_log(f\"Statements: Chunk {i}\/{total}...\")\n\n stored = analyze_chunk_statements(chunk[\"id\"], chunk[\"content\"], client)\n total_stored += stored\n\n db.log(\"INFO\", f\"Total statements stored: {total_stored}\")\n if progress:\n progress.add_log(f\"Statements: {total_stored} Aussagen extrahiert\")\n\n return total_stored\n\n\nif __name__ == \"__main__\":\n test_text = \"\"\"\n Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie.\n Diese Methode basiert auf Empathie, Kongruenz und bedingungsloser Wertschätzung.\n Sie wird heute weltweit in der Psychotherapie eingesetzt.\n \"\"\"\n\n print(\"Testing entity extraction with Ollama...\")\n entities = extract_entities_ollama(test_text)\n print(f\"Found {len(entities)} entities:\")\n for e in entities:\n print(f\" - {e['name']} ({e['type']})\")",
"originalFile": "\"\"\"\nSemantic analysis for KI-System Pipeline\nEntity extraction, taxonomy classification, ontology mapping.\n\"\"\"\n\nimport json\nimport os\nimport time\n\nimport requests\n\nfrom config import ANTHROPIC_API_KEY, ANTHROPIC_MODEL, OLLAMA_CHAT_MODEL, OLLAMA_HOST\nfrom db import db\nfrom protokoll import protokoll\n\n\ndef get_anthropic_client():\n \"\"\"Get Anthropic API client.\"\"\"\n try:\n import anthropic\n\n if ANTHROPIC_API_KEY:\n db.log(\"INFO\", \"Using Anthropic API (Claude)\")\n return anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)\n else:\n db.log(\"WARNING\", \"No Anthropic API key found, falling back to Ollama\")\n except ImportError:\n db.log(\"WARNING\", \"anthropic package not installed, falling back to Ollama\")\n return None\n\n\ndef extract_entities_ollama(text, model=\"gemma3:27b-it-qat\"):\n \"\"\"Extract entities using Ollama.\"\"\"\n # Load prompt from database\n prompt_template = db.get_prompt(\"entity_extraction\")\n\n if not prompt_template:\n db.log(\"WARNING\", \"entity_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Analysiere den Text und extrahiere wichtige Entitäten.\nKategorisiere als: PERSON, ORGANIZATION, CONCEPT, LOCATION\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"description\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": model, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n duration_ms = int((time.time() - start_time) * 1000)\n\n # Parse JSON from response\n response_text = data.get(\"response\", \"{}\")\n\n # Log to ki-protokoll\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=f\"ollama:{model}\",\n tokens_input=data.get(\"prompt_eval_count\", 0),\n tokens_output=data.get(\"eval_count\", 0),\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n try:\n entities = json.loads(response_text)\n return entities.get(\"entities\", [])\n except json.JSONDecodeError:\n db.log(\"WARNING\", \"Failed to parse entity JSON from Ollama\")\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Ollama entity extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=f\"ollama:{model}\",\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef extract_entities_anthropic(text, client):\n \"\"\"Extract entities using Anthropic Claude.\"\"\"\n # Get prompt from database\n prompt_template = db.get_prompt(\"entity_extraction\")\n\n if not prompt_template:\n prompt_template = \"\"\"Analysiere den folgenden deutschen Text und extrahiere alle wichtigen Entitäten.\n\nKategorisiere jede Entität als:\n- PERSON (Namen von Personen)\n- ORGANIZATION (Firmen, Institutionen, Gruppen)\n- CONCEPT (Fachbegriffe, Methoden, Theorien)\n- LOCATION (Orte, Länder)\n- DATE (Zeitangaben)\n- OTHER (Sonstiges)\n\nAntworte NUR im JSON-Format:\n{\"entities\": [{\"name\": \"...\", \"type\": \"...\", \"context\": \"kurzer Kontext der Erwähnung\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{TEXT}}\", text[:4000])\n\n try:\n start_time = time.time()\n message = client.messages.create(\n model=ANTHROPIC_MODEL, max_tokens=2000, messages=[{\"role\": \"user\", \"content\": prompt}]\n )\n duration_ms = int((time.time() - start_time) * 1000)\n\n response_text = message.content[0].text\n\n # Log to ki-protokoll\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=ANTHROPIC_MODEL,\n tokens_input=message.usage.input_tokens,\n tokens_output=message.usage.output_tokens,\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n # Extract JSON from response\n import re\n\n json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n if json_match:\n entities = json.loads(json_match.group())\n return entities.get(\"entities\", [])\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Anthropic entity extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[entity_extraction] {prompt[:500]}...\",\n model_name=ANTHROPIC_MODEL,\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef extract_relations(text, entities, client=None):\n \"\"\"Extract relations between entities.\"\"\"\n if not entities or len(entities) < 2:\n return []\n\n entity_names = [e[\"name\"] for e in entities[:20]]\n\n # Load prompt from database\n prompt_template = db.get_prompt(\"relation_extraction\")\n\n if not prompt_template:\n db.log(\"WARNING\", \"relation_extraction prompt not found in DB, using fallback\")\n prompt_template = \"\"\"Identifiziere Beziehungen zwischen Entitäten.\nEntitäten: {{ENTITIES}}\nBeziehungstypen: DEVELOPED_BY, RELATED_TO, PART_OF, USED_IN, BASED_ON\nAntworte NUR im JSON-Format:\n{\"relations\": [{\"source\": \"...\", \"relation\": \"...\", \"target\": \"...\"}]}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{ENTITIES}}\", \", \".join(entity_names))\n prompt = prompt.replace(\"{{TEXT}}\", text[:3000])\n\n try:\n start_time = time.time()\n tokens_in, tokens_out = 0, 0\n model_name = \"\"\n\n if client:\n message = client.messages.create(\n model=ANTHROPIC_MODEL, max_tokens=1000, messages=[{\"role\": \"user\", \"content\": prompt}]\n )\n response_text = message.content[0].text\n tokens_in = message.usage.input_tokens\n tokens_out = message.usage.output_tokens\n model_name = ANTHROPIC_MODEL\n else:\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=120,\n )\n response.raise_for_status()\n data = response.json()\n response_text = data.get(\"response\", \"{}\")\n tokens_in = data.get(\"prompt_eval_count\", 0)\n tokens_out = data.get(\"eval_count\", 0)\n model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n duration_ms = int((time.time() - start_time) * 1000)\n\n # Log to ki-protokoll\n protokoll.log_llm_call(\n request=f\"[relation_extraction] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=model_name,\n tokens_input=tokens_in,\n tokens_output=tokens_out,\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n import re\n\n json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n if json_match:\n data = json.loads(json_match.group())\n return data.get(\"relations\", [])\n return []\n except Exception as e:\n db.log(\"ERROR\", f\"Relation extraction failed: {e}\")\n protokoll.log_llm_call(\n request=f\"[relation_extraction] {prompt[:500]}...\",\n model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",\n status=\"error\",\n error_message=str(e),\n )\n return []\n\n\ndef classify_taxonomy(text, client=None):\n \"\"\"Classify text into taxonomy categories.\"\"\"\n prompt_template = db.get_prompt(\"taxonomy_classification\")\n\n if not prompt_template:\n prompt_template = \"\"\"Klassifiziere den folgenden Text in passende Kategorien.\n\nWähle aus diesen Hauptkategorien:\n- Methoden (Therapiemethoden, Techniken)\n- Theorie (Konzepte, Modelle, Grundlagen)\n- Praxis (Anwendung, Fallbeispiele)\n- Organisation (Strukturen, Prozesse)\n- Kommunikation (Gesprächsführung, Interaktion)\n- Entwicklung (Persönliche Entwicklung, Veränderung)\n\nAntworte NUR im JSON-Format:\n{\"categories\": [\"...\", \"...\"], \"confidence\": 0.0-1.0}\n\nText:\n{{TEXT}}\"\"\"\n\n prompt = prompt_template.replace(\"{{TEXT}}\", text[:2000])\n\n try:\n start_time = time.time()\n tokens_in, tokens_out = 0, 0\n model_name = \"\"\n\n if client:\n message = client.messages.create(\n model=ANTHROPIC_MODEL, max_tokens=500, messages=[{\"role\": \"user\", \"content\": prompt}]\n )\n response_text = message.content[0].text\n tokens_in = message.usage.input_tokens\n tokens_out = message.usage.output_tokens\n model_name = ANTHROPIC_MODEL\n else:\n response = requests.post(\n f\"{OLLAMA_HOST}\/api\/generate\",\n json={\"model\": OLLAMA_CHAT_MODEL, \"prompt\": prompt, \"stream\": False, \"format\": \"json\"},\n timeout=60,\n )\n response.raise_for_status()\n data = response.json()\n response_text = data.get(\"response\", \"{}\")\n tokens_in = data.get(\"prompt_eval_count\", 0)\n tokens_out = data.get(\"eval_count\", 0)\n model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"\n\n duration_ms = int((time.time() - start_time) * 1000)\n\n # Log to ki-protokoll\n protokoll.log_llm_call(\n request=f\"[taxonomy_classification] {prompt[:500]}...\",\n response=response_text[:2000],\n model_name=model_name,\n tokens_input=tokens_in,\n tokens_output=tokens_out,\n duration_ms=duration_ms,\n status=\"completed\",\n )\n\n import re\n\n json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)\n if json_m... [TRUNCATED-563c47d6afac0dae]",
"structuredPatch": [
{
"oldStart": 665,
"oldLines": 6,
"newStart": 665,
"newLines": 274,
"lines": [
" return analyzed",
" ",
" ",
"+def extract_statements(chunk_id: int, text: str, client=None) -> list[dict]:",
"+ \"\"\"",
"+ Extract SPO-triplets (Subject-Predicate-Object statements) from text.",
"+",
"+ Args:",
"+ chunk_id: ID of the chunk being analyzed",
"+ text: Text content to extract statements from",
"+ client: Optional Anthropic client (falls back to Ollama if None)",
"+",
"+ Returns:",
"+ List of extracted statements with entity linking",
"+ \"\"\"",
"+ prompt_template = db.get_prompt(\"statement_extraction\")",
"+",
"+ if not prompt_template:",
"+ db.log(\"WARNING\", \"statement_extraction prompt not found in DB, using fallback\")",
"+ prompt_template = \"\"\"Extrahiere alle faktischen Aussagen aus dem Text als SPO-Tripel.",
"+",
"+Regeln:",
"+- Subject: Eine benannte Entität (Person, Organisation, Konzept, Methode)",
"+- Predicate: Die Beziehung oder Eigenschaft (z.B. \"entwickelte\", \"basiert auf\", \"ist Teil von\")",
"+- Object: Eine Entität oder ein Literal-Wert",
"+",
"+Antworte NUR im JSON-Format:",
"+{\"statements\": [",
"+ {\"subject\": \"Name der Subject-Entität\", \"predicate\": \"Beziehung\", \"object\": \"Name oder Wert\", \"confidence\": 0.0-1.0}",
"+]}",
"+",
"+Text:",
"+{{TEXT}}\"\"\"",
"+",
"+ prompt = prompt_template.replace(\"{{TEXT}}\", text[:3000])",
"+",
"+ try:",
"+ start_time = time.time()",
"+ tokens_in, tokens_out = 0, 0",
"+ model_name = \"\"",
"+",
"+ if client:",
"+ message = client.messages.create(",
"+ model=ANTHROPIC_MODEL,",
"+ max_tokens=1500,",
"+ messages=[{\"role\": \"user\", \"content\": prompt}],",
"+ )",
"+ response_text = message.content[0].text",
"+ tokens_in = message.usage.input_tokens",
"+ tokens_out = message.usage.output_tokens",
"+ model_name = ANTHROPIC_MODEL",
"+ else:",
"+ response = requests.post(",
"+ f\"{OLLAMA_HOST}\/api\/generate\",",
"+ json={",
"+ \"model\": OLLAMA_CHAT_MODEL,",
"+ \"prompt\": prompt,",
"+ \"stream\": False,",
"+ \"format\": \"json\",",
"+ },",
"+ timeout=120,",
"+ )",
"+ response.raise_for_status()",
"+ data = response.json()",
"+ response_text = data.get(\"response\", \"{}\")",
"+ tokens_in = data.get(\"prompt_eval_count\", 0)",
"+ tokens_out = data.get(\"eval_count\", 0)",
"+ model_name = f\"ollama:{OLLAMA_CHAT_MODEL}\"",
"+",
"+ duration_ms = int((time.time() - start_time) * 1000)",
"+",
"+ # Log to ki-protokoll",
"+ protokoll.log_llm_call(",
"+ request=f\"[statement_extraction] chunk_id={chunk_id}\",",
"+ response=response_text[:2000],",
"+ model_name=model_name,",
"+ tokens_input=tokens_in,",
"+ tokens_output=tokens_out,",
"+ duration_ms=duration_ms,",
"+ status=\"completed\",",
"+ )",
"+",
"+ # Parse JSON",
"+ import re",
"+",
"+ json_match = re.search(r\"\\{[\\s\\S]*\\}\", response_text)",
"+ if json_match:",
"+ result = json.loads(json_match.group())",
"+ return result.get(\"statements\", [])",
"+ return []",
"+",
"+ except Exception as e:",
"+ db.log(\"ERROR\", f\"Statement extraction failed for chunk {chunk_id}: {e}\")",
"+ protokoll.log_llm_call(",
"+ request=f\"[statement_extraction] chunk_id={chunk_id}\",",
"+ model_name=ANTHROPIC_MODEL if client else f\"ollama:{OLLAMA_CHAT_MODEL}\",",
"+ status=\"error\",",
"+ error_message=str(e),",
"+ )",
"+ return []",
"+",
"+",
"+def find_entity_by_name(name: str) -> dict | None:",
"+ \"\"\"",
"+ Find entity by name or canonical_name.",
"+",
"+ Args:",
"+ name: Entity name to search for",
"+",
"+ Returns:",
"+ Entity dict with id, name, canonical_name or None",
"+ \"\"\"",
"+ name_lower = name.lower().strip()",
"+ canonical = name_lower.replace(\" \", \"\").replace(\"-\", \"\")",
"+",
"+ # Try exact canonical match first",
"+ cursor = db.execute(",
"+ \"SELECT id, name, canonical_name FROM entities WHERE canonical_name = %s LIMIT 1\",",
"+ (canonical,),",
"+ )",
"+ result = cursor.fetchone()",
"+ cursor.close()",
"+ if result:",
"+ return result",
"+",
"+ # Try name match (case-insensitive)",
"+ cursor = db.execute(",
"+ \"SELECT id, name, canonical_name FROM entities WHERE LOWER(name) = %s LIMIT 1\",",
"+ (name_lower,),",
"+ )",
"+ result = cursor.fetchone()",
"+ cursor.close()",
"+ if result:",
"+ return result",
"+",
"+ # Try partial canonical match",
"+ cursor = db.execute(",
"+ \"SELECT id, name, canonical_name FROM entities WHERE canonical_name LIKE %s LIMIT 1\",",
"+ (f\"%{canonical}%\",),",
"+ )",
"+ result = cursor.fetchone()",
"+ cursor.close()",
"+ return result",
"+",
"+",
"+def store_statements(chunk_id: int, statements: list[dict]) -> int:",
"+ \"\"\"",
"+ Store extracted statements in the database with entity linking.",
"+",
"+ Args:",
"+ chunk_id: ID of the source chunk",
"+ statements: List of statement dicts with subject, predicate, object, confidence",
"+",
"+ Returns:",
"+ Number of successfully stored statements",
"+ \"\"\"",
"+ stored = 0",
"+",
"+ for stmt in statements:",
"+ try:",
"+ subject_name = stmt.get(\"subject\", \"\").strip()",
"+ predicate = stmt.get(\"predicate\", \"\").strip()",
"+ object_value = stmt.get(\"object\", \"\").strip()",
"+ confidence = float(stmt.get(\"confidence\", 0.8))",
"+",
"+ if not subject_name or not predicate:",
"+ continue",
"+",
"+ # Find subject entity",
"+ subject_entity = find_entity_by_name(subject_name)",
"+ if not subject_entity:",
"+ db.log(\"DEBUG\", f\"Subject entity not found: {subject_name}\")",
"+ continue # Subject must be a known entity",
"+",
"+ subject_entity_id = subject_entity[\"id\"]",
"+",
"+ # Try to find object entity, otherwise use as literal",
"+ object_entity_id = None",
"+ object_literal = None",
"+",
"+ if object_value:",
"+ object_entity = find_entity_by_name(object_value)",
"+ if object_entity:",
"+ object_entity_id = object_entity[\"id\"]",
"+ else:",
"+ object_literal = object_value",
"+",
"+ # Insert statement",
"+ cursor = db.execute(",
"+ \"\"\"INSERT INTO statements",
"+ (subject_entity_id, predicate, object_entity_id, object_literal,",
"+ chunk_id, confidence, status, created_at)",
"+ VALUES (%s, %s, %s, %s, %s, %s, 'extracted', NOW())\"\"\",",
"+ (",
"+ subject_entity_id,",
"+ predicate[:255], # Truncate to VARCHAR(255)",
"+ object_entity_id,",
"+ object_literal,",
"+ chunk_id,",
"+ confidence,",
"+ ),",
"+ )",
"+ db.commit()",
"+ cursor.close()",
"+ stored += 1",
"+",
"+ except Exception as e:",
"+ db.log(\"WARNING\", f\"Failed to store statement: {e}\")",
"+",
"+ return stored",
"+",
"+",
"+def analyze_chunk_statements(chunk_id: int, content: str, client=None) -> int:",
"+ \"\"\"",
"+ Extract and store statements for a single chunk.",
"+",
"+ Args:",
"+ chunk_id: ID of the chunk",
"+ content: Text content of the chunk",
"+ client: Optional Anthropic client",
"+",
"+ Returns:",
"+ Number of statements stored",
"+ \"\"\"",
"+ statements = extract_statements(chunk_id, content, client)",
"+ if statements:",
"+ stored = store_statements(chunk_id, statements)",
"+ db.log(\"INFO\", f\"Chunk {chunk_id}: {stored}\/{len(statements)} statements stored\")",
"+ return stored",
"+ return 0",
"+",
"+",
"+def analyze_document_statements(document_id: int, client=None, progress=None) -> int:",
"+ \"\"\"",
"+ Extract statements from all chunks of a document.",
"+",
"+ Args:",
"+ document_id: ID of the document",
"+ client: Optional Anthropic client",
"+ progress: Optional PipelineProgress for logging",
"+",
"+ Returns:",
"+ Total number of statements stored",
"+ \"\"\"",
"+ cursor = db.execute(",
"+ \"SELECT id, content FROM chunks WHERE document_id = %s ORDER BY chunk_index\",",
"+ (document_id,),",
"+ )",
"+ chunks = cursor.fetchall()",
"+ cursor.close()",
"+",
"+ total = len(chunks)",
"+ db.log(\"INFO\", f\"Extracting statements from {total} chunks\")",
"+ if progress:",
"+ progress.add_log(f\"Statements: Extrahiere aus {total} Chunks...\")",
"+",
"+ total_stored = 0",
"+ for i, chunk in enumerate(chunks, 1):",
"+ if progress:",
"+ progress.add_log(f\"Statements: Chunk {i}\/{total}...\")",
"+",
"+ stored = analyze_chunk_statements(chunk[\"id\"], chunk[\"content\"], client)",
"+ total_stored += stored",
"+",
"+ db.log(\"INFO\", f\"Total statements stored: {total_stored}\")",
"+ if progress:",
"+ progress.add_log(f\"Statements: {total_stored} Aussagen extrahiert\")",
"+",
"+ return total_stored",
"+",
"+",
" if __name__ == \"__main__\":",
" test_text = \"\"\"",
" Carl Rogers entwickelte die klientenzentrierte Gesprächstherapie."
]
}
],
"userModified": false,
"replaceAll": false
}
}