{
"event": "PreToolUse",
"tool_name": "Write",
"tool_input": {
"file_path": "\/var\/www\/scripts\/pipeline\/generators\/format_checker.py",
"content": "\"\"\"\nDeterministic Format Checker - Code-based formatting validation.\n\nReplaces\/supplements LLM-based Formatierungsprüfer for reliable character-level checks.\nLLMs hallucinate on character detection; this module provides deterministic results.\n\"\"\"\n\nimport re\nimport sys\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\n# Common emoji ranges (simplified - covers most used emojis)\nEMOJI_PATTERN = re.compile(\n \"[\"\n \"\\U0001F300-\\U0001F9FF\" # Miscellaneous Symbols and Pictographs, Emoticons, etc.\n \"\\U00002702-\\U000027B0\" # Dingbats\n \"\\U0001F600-\\U0001F64F\" # Emoticons\n \"\\U0001F680-\\U0001F6FF\" # Transport and Map\n \"\\U00002600-\\U000026FF\" # Misc symbols\n \"\\U00002B50-\\U00002B55\" # Stars\n \"\\U000023E9-\\U000023F3\" # Media symbols\n \"\\U0000200D\" # Zero width joiner\n \"\\U0000FE0F\" # Variation selector\n \"]+\",\n flags=re.UNICODE,\n)\n\n# Special bullets that count as emojis in plain text context\nBULLET_EMOJIS = [\"▪️\", \"▫️\", \"◾\", \"◽\", \"●\", \"○\", \"◆\", \"◇\", \"★\", \"☆\", \"✓\", \"✗\", \"✔\", \"✘\", \"→\", \"➡\", \"➔\"]\n\n\ndef extract_rules(structure_config: dict | None, profile_config: dict | None) -> dict:\n \"\"\"\n Extract formatting rules from structure and profile configs.\n\n Args:\n structure_config: The structure configuration dict (from content_config.content)\n profile_config: The author profile configuration dict\n\n Returns:\n Dict with boolean flags for each rule type\n \"\"\"\n rules = {\n \"emojis_verboten\": False,\n \"markdown_verboten\": False,\n \"fettschrift_verboten\": False,\n \"kursiv_verboten\": False,\n \"header_verboten\": False,\n \"gedankenstriche_verboten\": False,\n \"hashtags_verboten\": False,\n \"ausrufezeichen_sparsam\": False,\n \"output_format\": \"markdown\",\n }\n\n # Extract from structure\n if structure_config:\n # Check ausgabe.format\n ausgabe = structure_config.get(\"ausgabe\", {})\n rules[\"output_format\"] = ausgabe.get(\"format\", \"markdown\")\n\n # If plain text, most formatting is implicitly forbidden\n if rules[\"output_format\"] == \"reiner Text\":\n rules[\"markdown_verboten\"] = True\n rules[\"fettschrift_verboten\"] = True\n rules[\"kursiv_verboten\"] = True\n rules[\"header_verboten\"] = True\n\n # Check formatierung section\n fmt = structure_config.get(\"formatierung\", {})\n if isinstance(fmt, dict):\n if \"verboten\" in str(fmt.get(\"emojis\", \"\")).lower():\n rules[\"emojis_verboten\"] = True\n if \"verboten\" in str(fmt.get(\"fettschrift\", \"\")).lower():\n rules[\"fettschrift_verboten\"] = True\n if \"verboten\" in str(fmt.get(\"markdown\", \"\")).lower():\n rules[\"markdown_verboten\"] = True\n rules[\"fettschrift_verboten\"] = True\n rules[\"kursiv_verboten\"] = True\n rules[\"header_verboten\"] = True\n if fmt.get(\"hashtags\") == \"keine\":\n rules[\"hashtags_verboten\"] = True\n\n # Check verboten array\n verboten = structure_config.get(\"verboten\", [])\n for v in verboten:\n v_lower = v.lower()\n if \"emoji\" in v_lower:\n rules[\"emojis_verboten\"] = True\n if \"fettschrift\" in v_lower or \"markdown\" in v_lower:\n rules[\"fettschrift_verboten\"] = True\n rules[\"markdown_verboten\"] = True\n if \"hashtag\" in v_lower:\n rules[\"hashtags_verboten\"] = True\n\n # Extract from profile\n if profile_config:\n # Check grammatik_und_satzbau section\n grammatik = profile_config.get(\"grammatik_und_satzbau\", {})\n if grammatik.get(\"gedankenstriche\") == \"verboten\":\n rules[\"gedankenstriche_verboten\"] = True\n\n # Check formatierung section in profile\n fmt = profile_config.get(\"formatierung\", {})\n if isinstance(fmt, dict):\n if \"verboten\" in str(fmt.get(\"emojis\", \"\")).lower():\n rules[\"emojis_verboten\"] = True\n if \"vermeiden\" in str(fmt.get(\"ausrufezeichen\", \"\")).lower():\n rules[\"ausrufezeichen_sparsam\"] = True\n\n return rules\n\n\ndef check_emojis(text: str) -> list[dict]:\n \"\"\"Check for emojis in text.\"\"\"\n issues = []\n\n # Check regex pattern\n matches = EMOJI_PATTERN.findall(text)\n if matches:\n for match in matches[:5]: # Limit to first 5\n pos = text.find(match)\n issues.append(\n {\"type\": \"emoji\", \"char\": match, \"position\": pos, \"context\": text[max(0, pos - 20) : pos + 20]}\n )\n\n # Check bullet emojis\n for bullet in BULLET_EMOJIS:\n if bullet in text:\n pos = text.find(bullet)\n issues.append(\n {\n \"type\": \"bullet_emoji\",\n \"char\": bullet,\n \"position\": pos,\n \"context\": text[max(0, pos - 20) : pos + 20],\n }\n )\n\n return issues\n\n\ndef check_markdown(text: str, check_bold: bool = True, check_italic: bool = True, check_headers: bool = True) -> list[dict]:\n \"\"\"Check for markdown formatting in text.\"\"\"\n issues = []\n\n # Bold: **text** or __text__\n if check_bold:\n bold_matches = re.finditer(r\"\\*\\*(.+?)\\*\\*\", text)\n for m in bold_matches:\n issues.append(\n {\n \"type\": \"bold\",\n \"match\": m.group(0)[:30],\n \"position\": m.start(),\n \"context\": text[max(0, m.start() - 10) : m.end() + 10],\n }\n )\n\n bold_matches2 = re.finditer(r\"__(.+?)__\", text)\n for m in bold_matches2:\n issues.append({\"type\": \"bold\", \"match\": m.group(0)[:30], \"position\": m.start()})\n\n # Italic: *text* (but not **)\n if check_italic:\n italic_matches = re.finditer(r\"(?<!\\*)\\*([^*\\n]+?)\\*(?!\\*)\", text)\n for m in italic_matches:\n # Skip if it's a list item marker\n if m.start() > 0 and text[m.start() - 1] == \"\\n\":\n continue\n issues.append(\n {\n \"type\": \"italic\",\n \"match\": m.group(0)[:30],\n \"position\": m.start(),\n \"context\": text[max(0, m.start() - 10) : m.end() + 10],\n }\n )\n\n # Headers: # ## ### at line start\n if check_headers:\n header_matches = re.finditer(r\"^#{1,6}\\s+.+\", text, re.MULTILINE)\n for m in header_matches:\n issues.append({\"type\": \"header\", \"match\": m.group(0)[:40], \"position\": m.start()})\n\n return issues\n\n\ndef check_gedankenstriche(text: str) -> list[dict]:\n \"\"\"Check for en-dash and em-dash characters.\"\"\"\n issues = []\n\n # En-dash: – (U+2013)\n for i, char in enumerate(text):\n if char == \"–\":\n context_start = max(0, i - 25)\n context_end = min(len(text), i + 25)\n issues.append(\n {\n \"type\": \"en_dash\",\n \"char\": \"–\",\n \"code\": \"U+2013\",\n \"position\": i,\n \"context\": f\"...{text[context_start:context_end]}...\",\n }\n )\n\n # Em-dash: — (U+2014)\n for i, char in enumerate(text):\n if char == \"—\":\n context_start = max(0, i - 25)\n context_end = min(len(text), i + 25)\n issues.append(\n {\n \"type\": \"em_dash\",\n \"char\": \"—\",\n \"code\": \"U+2014\",\n \"position\": i,\n \"context\": f\"...{text[context_start:context_end]}...\",\n }\n )\n\n return issues\n\n\ndef check_hashtags(text: str) -> list[dict]:\n \"\"\"Check for hashtags in text.\"\"\"\n issues = []\n\n hashtag_matches = re.finditer(r\"#[A-Za-zÄÖÜäöüß]\\w+\", text)\n for m in hashtag_matches:\n issues.append({\"type\": \"hashtag\", \"match\": m.group(0), \"position\": m.start()})\n\n return issues\n\n\ndef check_ausrufezeichen(text: str, max_allowed: int = 2) -> list[dict]:\n \"\"\"Check for excessive exclamation marks.\"\"\"\n issues = []\n\n count = text.count(\"!\")\n if count > max_allowed:\n # Find positions\n positions = [i for i, c in enumerate(text) if c == \"!\"]\n issues.append(\n {\"type\": \"ausrufezeichen\", \"count\": count, \"max_allowed\": max_allowed, \"positions\": positions[:5]}\n )\n\n return issues\n\n\ndef check_formatting(\n text: str,\n structure_config: dict | None = None,\n profile_config: dict | None = None,\n rules: dict | None = None,\n) -> dict[str, Any]:\n \"\"\"\n Run all formatting checks based on rules from structure and profile.\n\n Args:\n text: The content text to check\n structure_config: Structure configuration dict\n profile_config: Author profile configuration dict\n rules: Pre-extracted rules (optional, will be extracted if not provided)\n\n Returns:\n Dict with:\n - passed: bool\n - score: int (10 - number of issues, min 0)\n - issues: list of issue dicts\n - summary: str\n \"\"\"\n # Extract rules if not provided\n if rules is None:\n rules = extract_rules(structure_config, profile_config)\n\n all_issues = []\n\n # Run checks based on rules\n if rules.get(\"emojis_verboten\"):\n emoji_issues = check_emojis(text)\n all_issues.extend(emoji_issues)\n\n if rules.get(\"markdown_verboten\") or rules.get(\"fettschrift_verboten\"):\n md_issues = check_markdown(\n text,\n check_bold=rules.get(\"fettschrift_verboten\", False) or rules.get(\"markdown_verboten\", False),\n check_italic=rules.get(\"kursiv_verboten\", False) or rules.get(\"markdown_verboten\", False),\n check_headers=rules.get(\"header_verboten\", False) or rules.get(\"markdown_verboten\", False),\n )\n all_issues.extend(md_issu... [TRUNCATED-7e0310f4360e5c7e]"
}
}
{
"tool_response": {
"type": "create",
"filePath": "\/var\/www\/scripts\/pipeline\/generators\/format_checker.py",
"content": "\"\"\"\nDeterministic Format Checker - Code-based formatting validation.\n\nReplaces\/supplements LLM-based Formatierungsprüfer for reliable character-level checks.\nLLMs hallucinate on character detection; this module provides deterministic results.\n\"\"\"\n\nimport re\nimport sys\nfrom typing import Any\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\n\n# Common emoji ranges (simplified - covers most used emojis)\nEMOJI_PATTERN = re.compile(\n \"[\"\n \"\\U0001F300-\\U0001F9FF\" # Miscellaneous Symbols and Pictographs, Emoticons, etc.\n \"\\U00002702-\\U000027B0\" # Dingbats\n \"\\U0001F600-\\U0001F64F\" # Emoticons\n \"\\U0001F680-\\U0001F6FF\" # Transport and Map\n \"\\U00002600-\\U000026FF\" # Misc symbols\n \"\\U00002B50-\\U00002B55\" # Stars\n \"\\U000023E9-\\U000023F3\" # Media symbols\n \"\\U0000200D\" # Zero width joiner\n \"\\U0000FE0F\" # Variation selector\n \"]+\",\n flags=re.UNICODE,\n)\n\n# Special bullets that count as emojis in plain text context\nBULLET_EMOJIS = [\"▪️\", \"▫️\", \"◾\", \"◽\", \"●\", \"○\", \"◆\", \"◇\", \"★\", \"☆\", \"✓\", \"✗\", \"✔\", \"✘\", \"→\", \"➡\", \"➔\"]\n\n\ndef extract_rules(structure_config: dict | None, profile_config: dict | None) -> dict:\n \"\"\"\n Extract formatting rules from structure and profile configs.\n\n Args:\n structure_config: The structure configuration dict (from content_config.content)\n profile_config: The author profile configuration dict\n\n Returns:\n Dict with boolean flags for each rule type\n \"\"\"\n rules = {\n \"emojis_verboten\": False,\n \"markdown_verboten\": False,\n \"fettschrift_verboten\": False,\n \"kursiv_verboten\": False,\n \"header_verboten\": False,\n \"gedankenstriche_verboten\": False,\n \"hashtags_verboten\": False,\n \"ausrufezeichen_sparsam\": False,\n \"output_format\": \"markdown\",\n }\n\n # Extract from structure\n if structure_config:\n # Check ausgabe.format\n ausgabe = structure_config.get(\"ausgabe\", {})\n rules[\"output_format\"] = ausgabe.get(\"format\", \"markdown\")\n\n # If plain text, most formatting is implicitly forbidden\n if rules[\"output_format\"] == \"reiner Text\":\n rules[\"markdown_verboten\"] = True\n rules[\"fettschrift_verboten\"] = True\n rules[\"kursiv_verboten\"] = True\n rules[\"header_verboten\"] = True\n\n # Check formatierung section\n fmt = structure_config.get(\"formatierung\", {})\n if isinstance(fmt, dict):\n if \"verboten\" in str(fmt.get(\"emojis\", \"\")).lower():\n rules[\"emojis_verboten\"] = True\n if \"verboten\" in str(fmt.get(\"fettschrift\", \"\")).lower():\n rules[\"fettschrift_verboten\"] = True\n if \"verboten\" in str(fmt.get(\"markdown\", \"\")).lower():\n rules[\"markdown_verboten\"] = True\n rules[\"fettschrift_verboten\"] = True\n rules[\"kursiv_verboten\"] = True\n rules[\"header_verboten\"] = True\n if fmt.get(\"hashtags\") == \"keine\":\n rules[\"hashtags_verboten\"] = True\n\n # Check verboten array\n verboten = structure_config.get(\"verboten\", [])\n for v in verboten:\n v_lower = v.lower()\n if \"emoji\" in v_lower:\n rules[\"emojis_verboten\"] = True\n if \"fettschrift\" in v_lower or \"markdown\" in v_lower:\n rules[\"fettschrift_verboten\"] = True\n rules[\"markdown_verboten\"] = True\n if \"hashtag\" in v_lower:\n rules[\"hashtags_verboten\"] = True\n\n # Extract from profile\n if profile_config:\n # Check grammatik_und_satzbau section\n grammatik = profile_config.get(\"grammatik_und_satzbau\", {})\n if grammatik.get(\"gedankenstriche\") == \"verboten\":\n rules[\"gedankenstriche_verboten\"] = True\n\n # Check formatierung section in profile\n fmt = profile_config.get(\"formatierung\", {})\n if isinstance(fmt, dict):\n if \"verboten\" in str(fmt.get(\"emojis\", \"\")).lower():\n rules[\"emojis_verboten\"] = True\n if \"vermeiden\" in str(fmt.get(\"ausrufezeichen\", \"\")).lower():\n rules[\"ausrufezeichen_sparsam\"] = True\n\n return rules\n\n\ndef check_emojis(text: str) -> list[dict]:\n \"\"\"Check for emojis in text.\"\"\"\n issues = []\n\n # Check regex pattern\n matches = EMOJI_PATTERN.findall(text)\n if matches:\n for match in matches[:5]: # Limit to first 5\n pos = text.find(match)\n issues.append(\n {\"type\": \"emoji\", \"char\": match, \"position\": pos, \"context\": text[max(0, pos - 20) : pos + 20]}\n )\n\n # Check bullet emojis\n for bullet in BULLET_EMOJIS:\n if bullet in text:\n pos = text.find(bullet)\n issues.append(\n {\n \"type\": \"bullet_emoji\",\n \"char\": bullet,\n \"position\": pos,\n \"context\": text[max(0, pos - 20) : pos + 20],\n }\n )\n\n return issues\n\n\ndef check_markdown(text: str, check_bold: bool = True, check_italic: bool = True, check_headers: bool = True) -> list[dict]:\n \"\"\"Check for markdown formatting in text.\"\"\"\n issues = []\n\n # Bold: **text** or __text__\n if check_bold:\n bold_matches = re.finditer(r\"\\*\\*(.+?)\\*\\*\", text)\n for m in bold_matches:\n issues.append(\n {\n \"type\": \"bold\",\n \"match\": m.group(0)[:30],\n \"position\": m.start(),\n \"context\": text[max(0, m.start() - 10) : m.end() + 10],\n }\n )\n\n bold_matches2 = re.finditer(r\"__(.+?)__\", text)\n for m in bold_matches2:\n issues.append({\"type\": \"bold\", \"match\": m.group(0)[:30], \"position\": m.start()})\n\n # Italic: *text* (but not **)\n if check_italic:\n italic_matches = re.finditer(r\"(?<!\\*)\\*([^*\\n]+?)\\*(?!\\*)\", text)\n for m in italic_matches:\n # Skip if it's a list item marker\n if m.start() > 0 and text[m.start() - 1] == \"\\n\":\n continue\n issues.append(\n {\n \"type\": \"italic\",\n \"match\": m.group(0)[:30],\n \"position\": m.start(),\n \"context\": text[max(0, m.start() - 10) : m.end() + 10],\n }\n )\n\n # Headers: # ## ### at line start\n if check_headers:\n header_matches = re.finditer(r\"^#{1,6}\\s+.+\", text, re.MULTILINE)\n for m in header_matches:\n issues.append({\"type\": \"header\", \"match\": m.group(0)[:40], \"position\": m.start()})\n\n return issues\n\n\ndef check_gedankenstriche(text: str) -> list[dict]:\n \"\"\"Check for en-dash and em-dash characters.\"\"\"\n issues = []\n\n # En-dash: – (U+2013)\n for i, char in enumerate(text):\n if char == \"–\":\n context_start = max(0, i - 25)\n context_end = min(len(text), i + 25)\n issues.append(\n {\n \"type\": \"en_dash\",\n \"char\": \"–\",\n \"code\": \"U+2013\",\n \"position\": i,\n \"context\": f\"...{text[context_start:context_end]}...\",\n }\n )\n\n # Em-dash: — (U+2014)\n for i, char in enumerate(text):\n if char == \"—\":\n context_start = max(0, i - 25)\n context_end = min(len(text), i + 25)\n issues.append(\n {\n \"type\": \"em_dash\",\n \"char\": \"—\",\n \"code\": \"U+2014\",\n \"position\": i,\n \"context\": f\"...{text[context_start:context_end]}...\",\n }\n )\n\n return issues\n\n\ndef check_hashtags(text: str) -> list[dict]:\n \"\"\"Check for hashtags in text.\"\"\"\n issues = []\n\n hashtag_matches = re.finditer(r\"#[A-Za-zÄÖÜäöüß]\\w+\", text)\n for m in hashtag_matches:\n issues.append({\"type\": \"hashtag\", \"match\": m.group(0), \"position\": m.start()})\n\n return issues\n\n\ndef check_ausrufezeichen(text: str, max_allowed: int = 2) -> list[dict]:\n \"\"\"Check for excessive exclamation marks.\"\"\"\n issues = []\n\n count = text.count(\"!\")\n if count > max_allowed:\n # Find positions\n positions = [i for i, c in enumerate(text) if c == \"!\"]\n issues.append(\n {\"type\": \"ausrufezeichen\", \"count\": count, \"max_allowed\": max_allowed, \"positions\": positions[:5]}\n )\n\n return issues\n\n\ndef check_formatting(\n text: str,\n structure_config: dict | None = None,\n profile_config: dict | None = None,\n rules: dict | None = None,\n) -> dict[str, Any]:\n \"\"\"\n Run all formatting checks based on rules from structure and profile.\n\n Args:\n text: The content text to check\n structure_config: Structure configuration dict\n profile_config: Author profile configuration dict\n rules: Pre-extracted rules (optional, will be extracted if not provided)\n\n Returns:\n Dict with:\n - passed: bool\n - score: int (10 - number of issues, min 0)\n - issues: list of issue dicts\n - summary: str\n \"\"\"\n # Extract rules if not provided\n if rules is None:\n rules = extract_rules(structure_config, profile_config)\n\n all_issues = []\n\n # Run checks based on rules\n if rules.get(\"emojis_verboten\"):\n emoji_issues = check_emojis(text)\n all_issues.extend(emoji_issues)\n\n if rules.get(\"markdown_verboten\") or rules.get(\"fettschrift_verboten\"):\n md_issues = check_markdown(\n text,\n check_bold=rules.get(\"fettschrift_verboten\", False) or rules.get(\"markdown_verboten\", False),\n check_italic=rules.get(\"kursiv_verboten\", False) or rules.get(\"markdown_verboten\", False),\n check_headers=rules.get(\"header_verboten\", False) or rules.get(\"markdown_verboten\", False),\n )\n all_issues.extend(md_issu... [TRUNCATED-7e0310f4360e5c7e]",
"structuredPatch": [],
"originalFile": null
}
}