{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/test_taxonomy.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"\nTest script for Taxonomy Pipeline functionality.\nTests chunk and entity taxonomy assignments.\n\"\"\"\n\nimport sys\nimport os\n\n# Add pipeline directory to path\nsys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))\n\nfrom db import Database\n\n\ndef test_taxonomy_pipeline():\n \"\"\"Test the taxonomy pipeline with existing chunks.\"\"\"\n db = Database()\n\n if not db.connect():\n print(\"ERROR: Database connection failed\")\n return False\n\n print(\"=== Taxonomy Pipeline Test ===\\n\")\n\n # 1. Get taxonomy terms\n print(\"[1] Loading taxonomy terms...\")\n terms = db.get_taxonomy_terms()\n print(f\" Found {len(terms)} taxonomy terms\")\n for term in terms[:5]:\n print(f\" - {term['id']}: {term['name']}\")\n print()\n\n # 2. Get some chunks to test\n print(\"[2] Loading test chunks...\")\n cursor = db.execute(\n \"\"\"SELECT c.id, c.document_id, LEFT(c.content, 200) as content_preview\n FROM chunks c\n LIMIT 5\"\"\"\n )\n chunks = cursor.fetchall()\n cursor.close()\n print(f\" Found {len(chunks)} chunks for testing\")\n print()\n\n # 3. Assign taxonomy based on content keywords\n print(\"[3] Assigning taxonomy terms to chunks...\")\n keyword_map = {\n \"Coaching\": [\"coaching\", \"coach\", \"begleitung\"],\n \"Kommunikation\": [\"kommunikation\", \"fragen\", \"gespräch\", \"dialog\"],\n \"Methoden\": [\"methode\", \"werkzeug\", \"tool\", \"intervention\"],\n \"Theorie\": [\"theorie\", \"konzept\", \"modell\", \"ansatz\"],\n \"Prozess\": [\"prozess\", \"ablauf\", \"schritt\", \"phase\"],\n \"Organisation\": [\"team\", \"organisation\", \"gruppe\", \"zusammenarbeit\"],\n \"Entwicklung\": [\"entwicklung\", \"veränderung\", \"wachstum\"],\n }\n\n # Build term lookup\n term_lookup = {t[\"name\"]: t[\"id\"] for t in terms}\n\n assignments = 0\n for chunk in chunks:\n content_lower = chunk[\"content_preview\"].lower()\n chunk_id = chunk[\"id\"]\n\n for term_name, keywords in keyword_map.items():\n if term_name not in term_lookup:\n continue\n\n term_id = term_lookup[term_name]\n\n # Check if any keyword matches\n matches = sum(1 for kw in keywords if kw in content_lower)\n if matches > 0:\n # Calculate confidence based on matches\n confidence = min(0.5 + (matches * 0.15), 0.95)\n\n result = db.add_chunk_taxonomy(\n chunk_id=chunk_id,\n term_id=term_id,\n confidence=confidence,\n source=\"auto\",\n )\n\n if result:\n print(f\" Chunk {chunk_id} -> {term_name} (conf: {confidence:.2f})\")\n assignments += 1\n\n print(f\"\\n Created {assignments} new taxonomy assignments\")\n print()\n\n # 4. Verify mappings\n print(\"[4] Verifying taxonomy mappings...\")\n cursor = db.execute(\"SELECT COUNT(*) as count FROM chunk_taxonomy\")\n result = cursor.fetchone()\n cursor.close()\n print(f\" Total chunk_taxonomy entries: {result['count']}\")\n\n # 5. Show sample mappings\n print(\"\\n[5] Sample mappings with details:\")\n cursor = db.execute(\n \"\"\"SELECT ct.chunk_id, ct.confidence, ct.source, tt.name as term_name\n FROM chunk_taxonomy ct\n JOIN taxonomy_terms tt ON ct.taxonomy_term_id = tt.id\n ORDER BY ct.created_at DESC\n LIMIT 10\"\"\"\n )\n mappings = cursor.fetchall()\n cursor.close()\n\n for m in mappings:\n print(f\" Chunk {m['chunk_id']}: {m['term_name']} ({m['confidence']:.2f}, {m['source']})\")\n\n # 6. Test entity taxonomy (if entities exist)\n print(\"\\n[6] Checking entities for taxonomy assignment...\")\n cursor = db.execute(\"SELECT id, name, type FROM entities LIMIT 3\")\n entities = cursor.fetchall()\n cursor.close()\n\n if entities:\n print(f\" Found {len(entities)} entities\")\n for entity in entities:\n # Assign first matching taxonomy term\n if entity[\"type\"] and entity[\"type\"] in term_lookup:\n term_id = term_lookup[entity[\"type\"]]\n result = db.add_entity_taxonomy(\n entity_id=entity[\"id\"],\n term_id=term_id,\n relevance=0.8,\n validated=False,\n )\n if result:\n print(f\" Entity {entity['id']} ({entity['name']}) -> {entity['type']}\")\n else:\n print(\" No entities found to test\")\n\n db.disconnect()\n print(\"\\n=== Test Complete ===\")\n return True\n\n\nif __name__ == \"__main__\":\n success = test_taxonomy_pipeline()\n sys.exit(0 if success else 1)\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/test_taxonomy.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"\nTest script for Taxonomy Pipeline functionality.\nTests chunk and entity taxonomy assignments.\n\"\"\"\n\nimport sys\nimport os\n\n# Add pipeline directory to path\nsys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))\n\nfrom db import Database\n\n\ndef test_taxonomy_pipeline():\n \"\"\"Test the taxonomy pipeline with existing chunks.\"\"\"\n db = Database()\n\n if not db.connect():\n print(\"ERROR: Database connection failed\")\n return False\n\n print(\"=== Taxonomy Pipeline Test ===\\n\")\n\n # 1. Get taxonomy terms\n print(\"[1] Loading taxonomy terms...\")\n terms = db.get_taxonomy_terms()\n print(f\" Found {len(terms)} taxonomy terms\")\n for term in terms[:5]:\n print(f\" - {term['id']}: {term['name']}\")\n print()\n\n # 2. Get some chunks to test\n print(\"[2] Loading test chunks...\")\n cursor = db.execute(\n \"\"\"SELECT c.id, c.document_id, LEFT(c.content, 200) as content_preview\n FROM chunks c\n LIMIT 5\"\"\"\n )\n chunks = cursor.fetchall()\n cursor.close()\n print(f\" Found {len(chunks)} chunks for testing\")\n print()\n\n # 3. Assign taxonomy based on content keywords\n print(\"[3] Assigning taxonomy terms to chunks...\")\n keyword_map = {\n \"Coaching\": [\"coaching\", \"coach\", \"begleitung\"],\n \"Kommunikation\": [\"kommunikation\", \"fragen\", \"gespräch\", \"dialog\"],\n \"Methoden\": [\"methode\", \"werkzeug\", \"tool\", \"intervention\"],\n \"Theorie\": [\"theorie\", \"konzept\", \"modell\", \"ansatz\"],\n \"Prozess\": [\"prozess\", \"ablauf\", \"schritt\", \"phase\"],\n \"Organisation\": [\"team\", \"organisation\", \"gruppe\", \"zusammenarbeit\"],\n \"Entwicklung\": [\"entwicklung\", \"veränderung\", \"wachstum\"],\n }\n\n # Build term lookup\n term_lookup = {t[\"name\"]: t[\"id\"] for t in terms}\n\n assignments = 0\n for chunk in chunks:\n content_lower = chunk[\"content_preview\"].lower()\n chunk_id = chunk[\"id\"]\n\n for term_name, keywords in keyword_map.items():\n if term_name not in term_lookup:\n continue\n\n term_id = term_lookup[term_name]\n\n # Check if any keyword matches\n matches = sum(1 for kw in keywords if kw in content_lower)\n if matches > 0:\n # Calculate confidence based on matches\n confidence = min(0.5 + (matches * 0.15), 0.95)\n\n result = db.add_chunk_taxonomy(\n chunk_id=chunk_id,\n term_id=term_id,\n confidence=confidence,\n source=\"auto\",\n )\n\n if result:\n print(f\" Chunk {chunk_id} -> {term_name} (conf: {confidence:.2f})\")\n assignments += 1\n\n print(f\"\\n Created {assignments} new taxonomy assignments\")\n print()\n\n # 4. Verify mappings\n print(\"[4] Verifying taxonomy mappings...\")\n cursor = db.execute(\"SELECT COUNT(*) as count FROM chunk_taxonomy\")\n result = cursor.fetchone()\n cursor.close()\n print(f\" Total chunk_taxonomy entries: {result['count']}\")\n\n # 5. Show sample mappings\n print(\"\\n[5] Sample mappings with details:\")\n cursor = db.execute(\n \"\"\"SELECT ct.chunk_id, ct.confidence, ct.source, tt.name as term_name\n FROM chunk_taxonomy ct\n JOIN taxonomy_terms tt ON ct.taxonomy_term_id = tt.id\n ORDER BY ct.created_at DESC\n LIMIT 10\"\"\"\n )\n mappings = cursor.fetchall()\n cursor.close()\n\n for m in mappings:\n print(f\" Chunk {m['chunk_id']}: {m['term_name']} ({m['confidence']:.2f}, {m['source']})\")\n\n # 6. Test entity taxonomy (if entities exist)\n print(\"\\n[6] Checking entities for taxonomy assignment...\")\n cursor = db.execute(\"SELECT id, name, type FROM entities LIMIT 3\")\n entities = cursor.fetchall()\n cursor.close()\n\n if entities:\n print(f\" Found {len(entities)} entities\")\n for entity in entities:\n # Assign first matching taxonomy term\n if entity[\"type\"] and entity[\"type\"] in term_lookup:\n term_id = term_lookup[entity[\"type\"]]\n result = db.add_entity_taxonomy(\n entity_id=entity[\"id\"],\n term_id=term_id,\n relevance=0.8,\n validated=False,\n )\n if result:\n print(f\" Entity {entity['id']} ({entity['name']}) -> {entity['type']}\")\n else:\n print(\" No entities found to test\")\n\n db.disconnect()\n print(\"\\n=== Test Complete ===\")\n return True\n\n\nif __name__ == \"__main__\":\n success = test_taxonomy_pipeline()\n sys.exit(0 if success else 1)\n",
"structuredPatch": [],
"originalFile": null
}
}