persistence.py

Code Hygiene Score: 99

Issues 1

Zeile Typ Beschreibung
64 magic_number Magic Number gefunden: 100

Dependencies 4

Funktionen 4

Code

"""
Persistence Functions - Save content versions and sources to database.
"""

import json
import re
import sys

sys.path.insert(0, "/var/www/scripts/pipeline")

from db import db


def strip_markdown(text: str) -> str:
    """Remove markdown formatting from text for plain text output.

    Strips bold/italic markers, ATX header prefixes and inline-code backticks,
    then normalizes typographic dashes and quotation marks to ASCII.

    Single ``*`` / ``_`` markers are only removed when they actually delimit
    emphasis. Underscores inside words (``snake_case_name``) and asterisks
    flanked by whitespace (``2 * 3 * 4``, ``* list item``) are left untouched.

    Args:
        text: Markdown-formatted input.

    Returns:
        The input with the formatting described above removed.
    """
    # Bold: **text** and __text__
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"__(.+?)__", r"\1", text)
    # Italic: *text*. The opening marker must not be followed by whitespace
    # and the closing marker must not be preceded by it, so arithmetic and
    # list bullets are not mistaken for emphasis.
    text = re.sub(r"(?<!\*)\*(?!\s)([^*]+?)(?<!\s)\*(?!\*)", r"\1", text)
    # Italic: _text_. Additionally the markers must not touch a word
    # character on the outside, so intraword underscores survive.
    text = re.sub(r"(?<!\w)_(?!\s)([^_]+?)(?<!\s)_(?!\w)", r"\1", text)
    # Headers: leading #, ##, ... ###### followed by whitespace
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Inline code: `text`
    text = re.sub(r"`([^`]+?)`", r"\1", text)
    # A dash used as a parenthetical ("Gedankenstrich", surrounded by spaces)
    # becomes a sentence break for cleaner flow. This must run before the
    # single-character pass below, which would otherwise consume the dash.
    text = text.replace(" \u2013 ", ". ")  # en dash with spaces
    text = text.replace(" \u2014 ", ". ")  # em dash with spaces
    # Remaining single-character normalizations, applied in one pass.
    ascii_equivalents = str.maketrans(
        {
            "\u2013": "-",  # en dash without surrounding spaces (rare)
            "\u2014": "-",  # em dash without surrounding spaces (rare)
            "\u201e": '"',  # double low-9 quotation mark (German opening)
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u201a": "'",  # single low-9 quotation mark (German opening)
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
        }
    )
    return text.translate(ascii_equivalents)


def save_version(order_id: int, content: str, version_number: int = 1, output_format: str = "markdown") -> int:
    """Save a content version to the ``content_versions`` table.

    The content is stored as a JSON object with the keys ``text`` and
    ``format``. When ``output_format`` is ``"reiner Text"`` (German for
    "plain text"), markdown formatting is stripped before storing.

    Args:
        order_id: Value written to the ``order_id`` column.
        content: The text to store.
        version_number: Value written to the ``version_number`` column.
        output_format: Format label stored alongside the text.

    Returns:
        The ``lastrowid`` reported by the cursor for the inserted row.
    """
    # Strip markdown for plain text format
    if output_format == "reiner Text":
        content = strip_markdown(content)

    content_json = json.dumps({"text": content, "format": output_format})

    cursor = db.execute(
        """INSERT INTO content_versions (order_id, version_number, content)
           VALUES (%s, %s, %s)""",
        (order_id, version_number, content_json),
    )
    try:
        db.commit()
        return cursor.lastrowid
    finally:
        # Release the cursor even if the commit fails.
        cursor.close()


def save_sources(order_id: int, context: list[dict], *, match_prefix_len: int = 100) -> None:
    """Save RAG sources to ``content_sources``.

    For each context entry, the chunk is looked up by matching the first
    ``match_prefix_len`` characters of its content against ``chunks.content``.
    If a chunk is found, a row linking it to the order is inserted with
    ``INSERT IGNORE`` and committed.

    Args:
        order_id: Value written to the ``order_id`` column.
        context: Entries providing a ``"content"`` string and a ``"score"``
            value; ``"score"`` is only read when a matching chunk is found.
        match_prefix_len: Number of leading content characters used for the
            prefix match.
    """
    for ctx in context:
        prefix = ctx["content"][:match_prefix_len]
        if not prefix:
            # An empty prefix would yield the pattern "%", which matches any
            # row and would attach an arbitrary chunk to the order.
            continue

        # Escape LIKE metacharacters so the prefix is matched literally. The
        # escape character is declared explicitly in the query instead of
        # relying on the server's default.
        pattern = prefix.replace("!", "!!").replace("%", "!%").replace("_", "!_") + "%"

        cursor = db.execute(
            "SELECT id FROM chunks WHERE content LIKE %s ESCAPE '!' LIMIT 1",
            (pattern,),
        )
        try:
            chunk = cursor.fetchone()
        finally:
            cursor.close()

        if not chunk:
            continue

        cursor = db.execute(
            """INSERT IGNORE INTO content_sources (order_id, chunk_id, relevance_score)
               VALUES (%s, %s, %s)""",
            (order_id, chunk["id"], ctx["score"]),
        )
        try:
            db.commit()
        finally:
            # Release the cursor even if the commit fails.
            cursor.close()


def update_order_status(order_id: int, status: str) -> None:
    """Set the status of a row in ``content_orders`` and refresh ``updated_at``.

    Args:
        order_id: ``id`` of the row to update.
        status: New value for the ``status`` column.
    """
    cursor = db.execute(
        "UPDATE content_orders SET status = %s, updated_at = NOW() WHERE id = %s",
        (status, order_id),
    )
    try:
        db.commit()
    finally:
        # Release the cursor even if the commit fails.
        cursor.close()
← Übersicht