{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/fix_canonical_names.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"Fix missing canonical_name for all entities.\"\"\"\n\nimport re\nimport unicodedata\nimport mariadb\nimport os\nimport sys\n\n# Add pipeline directory to path\nsys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))\nfrom config import get_db_password\n\nDB_CONFIG = {\n \"host\": \"localhost\",\n \"database\": \"ki_content\",\n \"user\": \"root\",\n \"password\": get_db_password(),\n}\n\n\ndef normalize_name(name: str) -> str:\n \"\"\"Generate canonical_name from entity name.\n\n Rules:\n - Lowercase\n - German umlauts: ä→ae, ö→oe, ü→ue, ß→ss\n - Replace spaces with underscores\n - Remove special characters except underscores\n - Collapse multiple underscores\n \"\"\"\n if not name:\n return \"\"\n\n # Lowercase\n result = name.lower()\n\n # German umlauts\n replacements = {\n 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',\n 'Ä': 'ae', 'Ö': 'oe', 'Ü': 'ue',\n }\n for old, new in replacements.items():\n result = result.replace(old, new)\n\n # Normalize unicode (handle accents etc.)\n result = unicodedata.normalize('NFKD', result)\n result = result.encode('ascii', 'ignore').decode('ascii')\n\n # Replace spaces and hyphens with underscores\n result = re.sub(r'[\\s\\-]+', '_', result)\n\n # Remove all non-alphanumeric except underscores\n result = re.sub(r'[^a-z0-9_]', '', result)\n\n # Collapse multiple underscores\n result = re.sub(r'_+', '_', result)\n\n # Strip leading\/trailing underscores\n result = result.strip('_')\n\n return result\n\n\ndef main():\n print(\"Connecting to database...\")\n conn = mariadb.connect(**DB_CONFIG)\n cursor = conn.cursor(dictionary=True)\n\n # Get all entities without canonical_name\n cursor.execute(\"\"\"\n SELECT id, name, canonical_name\n FROM entities\n WHERE canonical_name IS NULL OR canonical_name = ''\n \"\"\")\n entities = cursor.fetchall()\n\n print(f\"Found {len(entities)} entities without canonical_name\")\n\n if not entities:\n print(\"Nothing to do!\")\n return\n\n # Update each entity\n updated = 0\n for entity in entities:\n canonical = normalize_name(entity['name'])\n if canonical:\n cursor.execute(\n \"UPDATE entities SET canonical_name = %s WHERE id = %s\",\n (canonical, entity['id'])\n )\n updated += 1\n if updated <= 10:\n print(f\" [{entity['id']}] {entity['name']} -> {canonical}\")\n\n if updated > 10:\n print(f\" ... and {updated - 10} more\")\n\n conn.commit()\n print(f\"\\nUpdated {updated} entities with canonical_name\")\n\n # Also update status to 'normalized' for entities with canonical_name\n cursor.execute(\"\"\"\n UPDATE entities\n SET status = 'normalized'\n WHERE canonical_name IS NOT NULL AND canonical_name != ''\n \"\"\")\n conn.commit()\n print(f\"Set status='normalized' for all entities with canonical_name\")\n\n cursor.close()\n conn.close()\n print(\"Done!\")\n\n\nif __name__ == \"__main__\":\n main()\n"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/fix_canonical_names.py",
"content": "#!\/usr\/bin\/env python3\n\"\"\"Fix missing canonical_name for all entities.\"\"\"\n\nimport re\nimport unicodedata\nimport mariadb\nimport os\nimport sys\n\n# Add pipeline directory to path\nsys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))\nfrom config import get_db_password\n\nDB_CONFIG = {\n \"host\": \"localhost\",\n \"database\": \"ki_content\",\n \"user\": \"root\",\n \"password\": get_db_password(),\n}\n\n\ndef normalize_name(name: str) -> str:\n \"\"\"Generate canonical_name from entity name.\n\n Rules:\n - Lowercase\n - German umlauts: ä→ae, ö→oe, ü→ue, ß→ss\n - Replace spaces with underscores\n - Remove special characters except underscores\n - Collapse multiple underscores\n \"\"\"\n if not name:\n return \"\"\n\n # Lowercase\n result = name.lower()\n\n # German umlauts\n replacements = {\n 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',\n 'Ä': 'ae', 'Ö': 'oe', 'Ü': 'ue',\n }\n for old, new in replacements.items():\n result = result.replace(old, new)\n\n # Normalize unicode (handle accents etc.)\n result = unicodedata.normalize('NFKD', result)\n result = result.encode('ascii', 'ignore').decode('ascii')\n\n # Replace spaces and hyphens with underscores\n result = re.sub(r'[\\s\\-]+', '_', result)\n\n # Remove all non-alphanumeric except underscores\n result = re.sub(r'[^a-z0-9_]', '', result)\n\n # Collapse multiple underscores\n result = re.sub(r'_+', '_', result)\n\n # Strip leading\/trailing underscores\n result = result.strip('_')\n\n return result\n\n\ndef main():\n print(\"Connecting to database...\")\n conn = mariadb.connect(**DB_CONFIG)\n cursor = conn.cursor(dictionary=True)\n\n # Get all entities without canonical_name\n cursor.execute(\"\"\"\n SELECT id, name, canonical_name\n FROM entities\n WHERE canonical_name IS NULL OR canonical_name = ''\n \"\"\")\n entities = cursor.fetchall()\n\n print(f\"Found {len(entities)} entities without canonical_name\")\n\n if not entities:\n print(\"Nothing to do!\")\n return\n\n # Update each entity\n updated = 0\n for entity in entities:\n canonical = normalize_name(entity['name'])\n if canonical:\n cursor.execute(\n \"UPDATE entities SET canonical_name = %s WHERE id = %s\",\n (canonical, entity['id'])\n )\n updated += 1\n if updated <= 10:\n print(f\" [{entity['id']}] {entity['name']} -> {canonical}\")\n\n if updated > 10:\n print(f\" ... and {updated - 10} more\")\n\n conn.commit()\n print(f\"\\nUpdated {updated} entities with canonical_name\")\n\n # Also update status to 'normalized' for entities with canonical_name\n cursor.execute(\"\"\"\n UPDATE entities\n SET status = 'normalized'\n WHERE canonical_name IS NOT NULL AND canonical_name != ''\n \"\"\")\n conn.commit()\n print(f\"Set status='normalized' for all entities with canonical_name\")\n\n cursor.close()\n conn.close()\n print(\"Done!\")\n\n\nif __name__ == \"__main__\":\n main()\n",
"structuredPatch": [],
"originalFile": null
}
}