fix_canonical_names.py
- Pfad: /var/www/scripts/pipeline/fix_canonical_names.py
- Namespace: pipeline
- Zeilen: 118 | Größe: 3,034 Bytes
- Geändert: 2025-12-25 09:31:24 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 98
- Dependencies: 90 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 6
- use os
- use re
- use sys
- use unicodedata
- use mysql.connector
- use config.DB_CONFIG
Funktionen 2
- normalize_name() Zeile 16
- main() Zeile 64
Code
#!/usr/bin/env python3
"""Fix missing canonical_name for all entities."""
import os
import re
import sys
import unicodedata
import mysql.connector
# Add pipeline directory to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from config import DB_CONFIG
# German umlauts and sharp s get their conventional two-letter ASCII spelling.
# Only lowercase keys are needed: the input is lowercased before translation.
_UMLAUT_TRANSLITERATION = str.maketrans(
    {
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "ß": "ss",
    }
)


def normalize_name(name: str) -> str:
    """Generate canonical_name from entity name.

    Rules, applied in order:
    - Compose the text (NFC) and lowercase it
    - German umlauts: ä→ae, ö→oe, ü→ue, ß→ss
    - Strip remaining accents and drop all non-ASCII characters
    - Replace runs of whitespace and hyphens with a single underscore
    - Remove every character that is not a-z, 0-9 or underscore
    - Collapse multiple underscores and strip them from both ends

    Args:
        name: The entity name; may be empty or None.

    Returns:
        The canonical form, or "" when the name is empty or contains no
        usable characters.
    """
    if not name:
        return ""

    # Compose first so a decomposed umlaut ("u" + U+0308) becomes "ü" and is
    # transliterated to "ue" instead of being reduced to a bare "u" below.
    result = unicodedata.normalize("NFC", name).lower()
    result = result.translate(_UMLAUT_TRANSLITERATION)

    # Decompose the remaining accented characters and drop everything that
    # has no ASCII representation (combining marks, symbols, other scripts).
    result = unicodedata.normalize("NFKD", result)
    result = result.encode("ascii", "ignore").decode("ascii")

    # Whitespace and hyphens act as word separators.
    result = re.sub(r"[\s\-]+", "_", result)
    # Anything else outside [a-z0-9_] is removed without a separator.
    result = re.sub(r"[^a-z0-9_]", "", result)
    # Removing characters can leave adjacent underscores behind.
    result = re.sub(r"_+", "_", result)

    return result.strip("_")
def main():
    """Fill in canonical_name for entities that lack one and mark them normalized.

    Connects to the ``ki_content`` database using the credentials from
    ``DB_CONFIG``, derives ``canonical_name`` from ``name`` for every row of
    ``entities`` where it is NULL or empty, and then sets
    ``status = 'normalized'`` on every row that has a canonical_name.
    If no row needs fixing, the function returns without touching status.
    The cursor and connection are closed on every exit path.
    """
    print("Connecting to database...")
    # Override database to ki_content
    config = DB_CONFIG.copy()
    config["database"] = "ki_content"
    conn = mysql.connector.connect(**config)
    try:
        cursor = conn.cursor(dictionary=True)
        try:
            # Get all entities without canonical_name
            cursor.execute("""
                SELECT id, name, canonical_name
                FROM entities
                WHERE canonical_name IS NULL OR canonical_name = ''
            """)
            entities = cursor.fetchall()
            print(f"Found {len(entities)} entities without canonical_name")
            if not entities:
                print("Nothing to do!")
                return

            # Update each entity
            updated = 0
            skipped = 0
            for entity in entities:
                canonical = normalize_name(entity["name"])
                if not canonical:
                    # The name has no usable characters; leave the row as is.
                    skipped += 1
                    continue
                cursor.execute(
                    "UPDATE entities SET canonical_name = %s WHERE id = %s",
                    (canonical, entity["id"]),
                )
                updated += 1
                # Only the first ten updates are listed individually.
                if updated <= 10:
                    print(f"  [{entity['id']}] {entity['name']} -> {canonical}")
            if updated > 10:
                print(f"  ... and {updated - 10} more")
            conn.commit()
            print(f"\nUpdated {updated} entities with canonical_name")
            if skipped:
                print(f"Skipped {skipped} entities whose name yields an empty canonical_name")

            # Also update status to 'normalized' for entities with canonical_name
            cursor.execute("""
                UPDATE entities
                SET status = 'normalized'
                WHERE canonical_name IS NOT NULL AND canonical_name != ''
            """)
            conn.commit()
            print("Set status='normalized' for all entities with canonical_name")
        finally:
            cursor.close()
    finally:
        conn.close()
    print("Done!")
# Run the fix only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()