format_checker.py

Pfad: /var/www/scripts/pipeline/generators/format_checker.py
Namespace: pipeline
Zeilen: 408 | Größe: 13,798 Bytes
Geändert: 2025-12-31 02:22:45 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 86

Dependencies: 100 (25%)
LOC: 30 (20%)
Methods: 100 (20%)
Secrets: 100 (15%)
Classes: 100 (10%)
Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 5

use re
use sys
use typing.Any
use json
use db.db
Funktionen 8

extract_rules() Zeile 34
check_emojis() Zeile 115
check_markdown() Zeile 144
check_gedankenstriche() Zeile 192
check_hashtags() Zeile 229
check_ausrufezeichen() Zeile 240
check_formatting() Zeile 255
check_order_formatting() Zeile 348
Code

"""
Deterministic Format Checker - Code-based formatting validation.

Replaces/supplements LLM-based Formatierungsprüfer for reliable character-level checks.
LLMs hallucinate on character detection; this module provides deterministic results.
"""

import re
import sys
from typing import Any

# Prepend the pipeline directory to the module search path.
# NOTE(review): presumably this is what lets the function-local
# `from db import db` in check_order_formatting resolve - confirm.
sys.path.insert(0, "/var/www/scripts/pipeline")

# Common emoji ranges (simplified - covers most used emojis).
# The character class is followed by "+", so a run of adjacent matching
# characters is returned as ONE match rather than one match per emoji.
# The zero width joiner and the variation selector are ordinary members of the
# class, so each of them also matches when it appears on its own.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001f300-\U0001f9ff"  # Miscellaneous Symbols and Pictographs, Emoticons, etc.
    "\U00002702-\U000027b0"  # Dingbats
    "\U0001f600-\U0001f64f"  # Emoticons (subset of the first range, kept for readability)
    "\U0001f680-\U0001f6ff"  # Transport and Map (subset of the first range as well)
    "\U00002600-\U000026ff"  # Misc symbols
    "\U00002b50-\U00002b55"  # Stars
    "\U000023e9-\U000023f3"  # Media symbols
    "\U0000200d"  # Zero width joiner
    "\U0000fe0f"  # Variation selector
    "]+",
    flags=re.UNICODE,  # already the default for str patterns in Python 3; harmless
)

# Special bullets that count as emojis in plain text context.
# Several entries (e.g. ★ U+2605, ✓ U+2713) also lie inside the ranges of
# EMOJI_PATTERN, so check_emojis can report them under both issue types.
BULLET_EMOJIS = ["▪️", "▫️", "◾", "◽", "●", "○", "◆", "◇", "★", "☆", "✓", "✗", "✔", "✘", "→", "➡", "➔"]


def extract_rules(structure_config: dict | None, profile_config: dict | None) -> dict:
    """
    Extract formatting rules from structure and profile configs.

    Args:
        structure_config: The structure configuration dict (from content_config.content)
        profile_config: The author profile configuration dict

    Returns:
        Dict with boolean flags for each rule type
    """
    rules = {
        "emojis_verboten": False,
        "markdown_verboten": False,
        "fettschrift_verboten": False,
        "kursiv_verboten": False,
        "header_verboten": False,
        "gedankenstriche_verboten": False,
        "hashtags_verboten": False,
        "ausrufezeichen_sparsam": False,
        "output_format": "markdown",
    }

    # Extract from structure
    if structure_config:
        # Check ausgabe.format
        ausgabe = structure_config.get("ausgabe", {})
        rules["output_format"] = ausgabe.get("format", "markdown")

        # If plain text, most formatting is implicitly forbidden
        if rules["output_format"] == "reiner Text":
            rules["markdown_verboten"] = True
            rules["fettschrift_verboten"] = True
            rules["kursiv_verboten"] = True
            rules["header_verboten"] = True

        # Check formatierung section
        fmt = structure_config.get("formatierung", {})
        if isinstance(fmt, dict):
            if "verboten" in str(fmt.get("emojis", "")).lower():
                rules["emojis_verboten"] = True
            if "verboten" in str(fmt.get("fettschrift", "")).lower():
                rules["fettschrift_verboten"] = True
            if "verboten" in str(fmt.get("markdown", "")).lower():
                rules["markdown_verboten"] = True
                rules["fettschrift_verboten"] = True
                rules["kursiv_verboten"] = True
                rules["header_verboten"] = True
            if fmt.get("hashtags") == "keine":
                rules["hashtags_verboten"] = True

        # Check verboten array
        verboten = structure_config.get("verboten", [])
        for v in verboten:
            v_lower = v.lower()
            if "emoji" in v_lower:
                rules["emojis_verboten"] = True
            if "fettschrift" in v_lower or "markdown" in v_lower:
                rules["fettschrift_verboten"] = True
                rules["markdown_verboten"] = True
            if "hashtag" in v_lower:
                rules["hashtags_verboten"] = True

    # Extract from profile
    if profile_config:
        # Check grammatik_und_satzbau section
        grammatik = profile_config.get("grammatik_und_satzbau", {})
        if grammatik.get("gedankenstriche") == "verboten":
            rules["gedankenstriche_verboten"] = True

        # Check formatierung section in profile
        fmt = profile_config.get("formatierung", {})
        if isinstance(fmt, dict):
            if "verboten" in str(fmt.get("emojis", "")).lower():
                rules["emojis_verboten"] = True
            if "vermeiden" in str(fmt.get("ausrufezeichen", "")).lower():
                rules["ausrufezeichen_sparsam"] = True

    return rules


def check_emojis(text: str) -> list[dict]:
    """Check for emojis in text.

    Two passes are made: EMOJI_PATTERN matches (at most the first 5 are
    reported) and a literal search for every entry of BULLET_EMOJIS (first
    occurrence only).

    Args:
        text: The content text to scan.

    Returns:
        List of issue dicts with keys ``type`` ("emoji" or "bullet_emoji"),
        ``char``, ``position`` and ``context`` (up to 20 characters on either
        side of the position).
    """
    issues = []

    # Use the match objects for the position: looking the matched string up
    # again with text.find() would give every repeated emoji the position of
    # its first occurrence.
    for found in list(EMOJI_PATTERN.finditer(text))[:5]:  # Limit to first 5
        pos = found.start()
        issues.append(
            {"type": "emoji", "char": found.group(0), "position": pos, "context": text[max(0, pos - 20) : pos + 20]}
        )

    # NOTE(review): bullets whose code point also lies inside EMOJI_PATTERN's
    # ranges are reported by both passes; left unchanged here.
    for bullet in BULLET_EMOJIS:
        pos = text.find(bullet)
        if pos == -1:
            continue
        issues.append(
            {
                "type": "bullet_emoji",
                "char": bullet,
                "position": pos,
                "context": text[max(0, pos - 20) : pos + 20],
            }
        )

    return issues


def check_markdown(
    text: str, check_bold: bool = True, check_italic: bool = True, check_headers: bool = True
) -> list[dict]:
    """Check for markdown formatting in text.

    Args:
        text: The content text to scan.
        check_bold: Report ``**text**`` and ``__text__``.
        check_italic: Report ``*text*``. The opening asterisk must be followed
            by, and the closing asterisk preceded by, a non-whitespace
            character, so list markers (``* item``) and spaced arithmetic
            (``2 * 3 * 4``) are not reported while italics at the start of a
            line are.
        check_headers: Report ``#`` to ``######`` headers at line start.

    Returns:
        List of issue dicts with keys ``type`` ("bold", "italic" or "header"),
        ``match`` (truncated), ``position`` and ``context`` (the match plus up
        to 10 characters on either side).
    """
    issues = []

    def report(kind, m, max_len):
        """Append one issue; every issue type carries the same set of keys."""
        issues.append(
            {
                "type": kind,
                "match": m.group(0)[:max_len],
                "position": m.start(),
                "context": text[max(0, m.start() - 10) : m.end() + 10],
            }
        )

    # Bold: **text** or __text__
    if check_bold:
        for bold_pattern in (r"\*\*(.+?)\*\*", r"__(.+?)__"):
            for m in re.finditer(bold_pattern, text):
                report("bold", m, 30)

    # Italic: *text* (but not **). The whitespace lookarounds replace the old
    # "preceded by a newline" heuristic, which skipped real italics at the
    # start of a line and still flagged a list marker at the very start of text.
    if check_italic:
        for m in re.finditer(r"(?<!\*)\*(?!\s)([^*\n]+?)(?<!\s)\*(?!\*)", text):
            report("italic", m, 30)

    # Headers: # ## ### at line start
    if check_headers:
        for m in re.finditer(r"^#{1,6}\s+.+", text, re.MULTILINE):
            report("header", m, 40)

    return issues


def check_gedankenstriche(text: str) -> list[dict]:
    """Report every en dash (U+2013) and em dash (U+2014) found in the text.

    All en dashes are listed first (in text order), followed by all em dashes.
    Each issue carries the dash, its code point label, its index and a context
    window of up to 25 characters before the dash and 25 characters starting at
    the dash, wrapped in "..." on both sides.
    """
    dash_kinds = (
        ("en_dash", "–", "U+2013"),
        ("em_dash", "—", "U+2014"),
    )
    text_length = len(text)
    findings = []

    for kind, dash, code_point in dash_kinds:
        for index, current in enumerate(text):
            if current != dash:
                continue
            window = text[max(0, index - 25) : min(text_length, index + 25)]
            findings.append(
                {
                    "type": kind,
                    "char": dash,
                    "code": code_point,
                    "position": index,
                    "context": f"...{window}...",
                }
            )

    return findings


def check_hashtags(text: str) -> list[dict]:
    """Return one issue per hashtag found in the text.

    A hashtag is "#" followed by a letter (ASCII or German umlaut/eszett) and
    at least one further word character.
    """
    return [
        {"type": "hashtag", "match": found.group(0), "position": found.start()}
        for found in re.finditer(r"#[A-Za-zÄÖÜäöüß]\w+", text)
    ]


def check_ausrufezeichen(text: str, max_allowed: int = 2) -> list[dict]:
    """Flag the text when it holds more than max_allowed exclamation marks.

    Returns an empty list when the limit is respected, otherwise a single
    issue with the total count, the limit and the first five positions.
    """
    mark_positions = [index for index, current in enumerate(text) if current == "!"]
    total = len(mark_positions)

    if total <= max_allowed:
        return []

    return [
        {
            "type": "ausrufezeichen",
            "count": total,
            "max_allowed": max_allowed,
            "positions": mark_positions[:5],
        }
    ]


def check_formatting(
    text: str,
    structure_config: dict | None = None,
    profile_config: dict | None = None,
    rules: dict | None = None,
) -> dict[str, Any]:
    """
    Run all formatting checks based on rules from structure and profile.

    Args:
        text: The content text to check
        structure_config: Structure configuration dict
        profile_config: Author profile configuration dict
        rules: Pre-extracted rules (optional, will be extracted if not provided)

    Returns:
        Dict with:
            - passed: bool
            - score: int (10 - number of issues, min 0)
            - issues: list of issue dicts
            - summary: str
    """
    # Extract rules if not provided
    if rules is None:
        rules = extract_rules(structure_config, profile_config)

    all_issues = []

    # Run checks based on rules
    if rules.get("emojis_verboten"):
        emoji_issues = check_emojis(text)
        all_issues.extend(emoji_issues)

    if rules.get("markdown_verboten") or rules.get("fettschrift_verboten"):
        md_issues = check_markdown(
            text,
            check_bold=rules.get("fettschrift_verboten", False) or rules.get("markdown_verboten", False),
            check_italic=rules.get("kursiv_verboten", False) or rules.get("markdown_verboten", False),
            check_headers=rules.get("header_verboten", False) or rules.get("markdown_verboten", False),
        )
        all_issues.extend(md_issues)

    if rules.get("gedankenstriche_verboten"):
        dash_issues = check_gedankenstriche(text)
        all_issues.extend(dash_issues)

    if rules.get("hashtags_verboten"):
        hashtag_issues = check_hashtags(text)
        all_issues.extend(hashtag_issues)

    if rules.get("ausrufezeichen_sparsam"):
        excl_issues = check_ausrufezeichen(text)
        all_issues.extend(excl_issues)

    # Calculate score
    issue_count = len(all_issues)
    score = max(0, 10 - issue_count)
    passed = issue_count == 0

    # Build summary
    if passed:
        summary = "Keine Formatierungsfehler gefunden."
    else:
        issue_types = list({i["type"] for i in all_issues})
        summary = f"{issue_count} Formatierungsfehler: {', '.join(issue_types)}"

    # Format issues for output
    formatted_issues = []
    for issue in all_issues:
        if issue["type"] in ["en_dash", "em_dash"]:
            formatted_issues.append(
                f"Gedankenstrich ({issue['code']}) an Position {issue['position']}: {issue.get('context', '')}"
            )
        elif issue["type"] in ["bold", "italic", "header"]:
            formatted_issues.append(f"Markdown ({issue['type']}): {issue.get('match', '')}")
        elif issue["type"] in ["emoji", "bullet_emoji"]:
            formatted_issues.append(f"Emoji gefunden: {issue['char']}")
        elif issue["type"] == "hashtag":
            formatted_issues.append(f"Hashtag gefunden: {issue['match']}")
        elif issue["type"] == "ausrufezeichen":
            formatted_issues.append(f"Zu viele Ausrufezeichen: {issue['count']} (max {issue['max_allowed']})")

    return {
        "passed": passed,
        "score": score,
        "issues": formatted_issues,
        "issues_detail": all_issues,
        "summary": summary,
        "rules_applied": rules,
    }


# Convenience function for direct testing
def check_order_formatting(order_id: int) -> dict[str, Any]:
    """
    Check formatting for a content order's latest version.

    Loads the version with the highest version_number for the order together
    with the linked structure and author profile configs, decodes the JSON
    columns and passes them to check_formatting.

    Args:
        order_id: The content order ID

    Returns:
        Formatting check results, or ``{"error": ...}`` when the query
        returns no row for the order.
    """
    import json

    from db import db

    def decode(value):
        """Decode a JSON string; pass already-decoded values through."""
        return json.loads(value) if isinstance(value, str) else value

    db.connect()

    try:
        # Get latest version content
        cursor = db.execute(
            """
            SELECT cv.content, co.structure_id, co.author_profile_id,
                   cs.content as structure_config, cp.content as profile_config
            FROM content_versions cv
            JOIN content_orders co ON cv.order_id = co.id
            LEFT JOIN content_config cs ON co.structure_id = cs.id
            LEFT JOIN content_config cp ON co.author_profile_id = cp.id
            WHERE cv.order_id = %s
            ORDER BY cv.version_number DESC LIMIT 1
            """,
            (order_id,),
        )
        # Close the cursor even when fetching fails.
        try:
            row = cursor.fetchone()
        finally:
            cursor.close()

        if not row:
            return {"error": f"Order {order_id} not found"}

        # NOTE(review): the row is read with both row[...] and row.get(...),
        # so the db layer presumably returns dict-like rows - confirm.
        content_data = decode(row["content"])
        # A JSON null under "text" is treated like a missing key; the regex
        # based checks cannot handle None.
        text = content_data.get("text") or ""

        # Parse configs (LEFT JOINed, so either may be absent)
        raw_structure = row.get("structure_config")
        structure_config = decode(raw_structure) if raw_structure else None

        raw_profile = row.get("profile_config")
        profile_config = decode(raw_profile) if raw_profile else None

        return check_formatting(text, structure_config, profile_config)

    finally:
        db.disconnect()
← Übersicht