fix_canonical_names.py

Code Hygiene Score: 98

Keine Issues gefunden.

Dependencies 6

Funktionen 2

Code

#!/usr/bin/env python3
"""Fix missing canonical_name for all entities."""

import os
import re
import sys
import unicodedata

import mysql.connector

# Add pipeline directory to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from config import DB_CONFIG


def normalize_name(name: str) -> str:
    """Generate the canonical_name slug for an entity name.

    Steps, in order:
    - Lowercase the input.
    - Compose to NFC so umlauts written as base letter + combining
      diaeresis are handled the same as precomposed ones.
    - Transliterate German umlauts: ä→ae, ö→oe, ü→ue, ß→ss.
    - Strip remaining accents (NFKD, then drop every non-ASCII character).
    - Replace runs of whitespace and hyphens with a single underscore.
    - Remove every character that is not a-z, 0-9 or underscore.
    - Collapse repeated underscores and strip them from both ends.

    Args:
        name: Entity name. Falsy values (empty string, None) are accepted.

    Returns:
        The canonical name, or "" if the input is falsy or nothing
        usable remains after cleaning.
    """
    if not name:
        return ""

    # Lowercase first, then compose: without NFC a decomposed "u" + U+0308
    # would miss the umlaut mapping below and end up as plain "u".
    result = unicodedata.normalize("NFC", name.lower())

    # German umlauts (input is already lowercase, so no uppercase keys needed)
    umlauts = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    result = result.translate(umlauts)

    # Decompose remaining accented characters and drop anything non-ASCII
    result = unicodedata.normalize("NFKD", result)
    result = result.encode("ascii", "ignore").decode("ascii")

    # Replace spaces and hyphens with underscores
    result = re.sub(r"[\s\-]+", "_", result)

    # Remove all non-alphanumeric except underscores
    result = re.sub(r"[^a-z0-9_]", "", result)

    # Collapse multiple underscores
    result = re.sub(r"_+", "_", result)

    # Strip leading/trailing underscores
    return result.strip("_")


def main():
    """Backfill canonical_name for entities in the ki_content database.

    Connects with DB_CONFIG (database overridden to "ki_content"), selects
    every row of `entities` whose canonical_name is NULL or empty, writes
    the value produced by normalize_name() for each row that yields a
    non-empty result, and commits. Afterwards sets status='normalized' on
    every entity that has a non-empty canonical_name and commits again.

    If no entity is missing a canonical_name the function returns early
    and the status update is not run. The cursor and the connection are
    closed on every exit path, including the early return and errors.
    """
    print("Connecting to database...")
    # Override database to ki_content
    config = DB_CONFIG.copy()
    config["database"] = "ki_content"
    conn = mysql.connector.connect(**config)
    try:
        cursor = conn.cursor(dictionary=True)
        try:
            # Get all entities without canonical_name
            cursor.execute("""
        SELECT id, name, canonical_name
        FROM entities
        WHERE canonical_name IS NULL OR canonical_name = ''
    """)
            entities = cursor.fetchall()

            print(f"Found {len(entities)} entities without canonical_name")

            if not entities:
                print("Nothing to do!")
                return

            # Update each entity; only the first 10 changes are echoed
            updated = 0
            for entity in entities:
                canonical = normalize_name(entity["name"])
                if not canonical:
                    # Name normalizes to nothing usable; leave the row as is
                    continue
                cursor.execute(
                    "UPDATE entities SET canonical_name = %s WHERE id = %s",
                    (canonical, entity["id"]),
                )
                updated += 1
                if updated <= 10:
                    print(f"  [{entity['id']}] {entity['name']} -> {canonical}")

            if updated > 10:
                print(f"  ... and {updated - 10} more")

            conn.commit()
            print(f"\nUpdated {updated} entities with canonical_name")

            # Also update status to 'normalized' for entities with canonical_name
            cursor.execute("""
        UPDATE entities
        SET status = 'normalized'
        WHERE canonical_name IS NOT NULL AND canonical_name != ''
    """)
            conn.commit()
            print("Set status='normalized' for all entities with canonical_name")
        finally:
            cursor.close()
    finally:
        conn.close()
    print("Done!")


# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
← Übersicht