persistence.py
- Pfad: /var/www/scripts/pipeline/generators/persistence.py
- Namespace: pipeline
- Zeilen: 83 | Größe: 3,072 Bytes
- Geändert: 2025-12-31 03:01:09 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 99
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 90 (10%)
Issues 1
| Zeile | Typ | Beschreibung |
|-------|-----|--------------|
| 64 | magic_number | Magic Number gefunden: 100 |
Dependencies 4
- use json
- use re
- use sys
- use db.db
Funktionen 4
- strip_markdown() (Zeile 14)
- save_version() (Zeile 41)
- save_sources() (Zeile 60)
- update_order_status() (Zeile 78)
Code
"""
Persistence Functions - Save content versions and sources to database.
"""
import json
import re
import sys
sys.path.insert(0, "/var/www/scripts/pipeline")
from db import db
def strip_markdown(text: str) -> str:
    """Remove markdown formatting from text for plain text output.

    Strips bold, italic, ``#`` header and inline-code markers, turns spaced
    en/em dashes into sentence breaks, and replaces typographic quotation
    marks with straight ASCII quotes.

    Single ``*`` / ``_`` markers are only treated as emphasis when they hug
    the text they wrap (``*word*``, ``_word_``). A lone ``*`` used as a list
    bullet or multiplication sign, and underscores inside identifiers such
    as ``snake_case_name``, are left untouched.

    Args:
        text: Markdown-formatted input.

    Returns:
        The text with the formatting described above removed.
    """
    # Remove bold **text** and __text__
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"__(.+?)__", r"\1", text)

    # Remove italic *text* and _text_ (single markers).
    # The opening marker must be followed by, and the closing marker preceded
    # by, a non-space character; otherwise "* item" bullets and "2 * 3 * 4"
    # would lose their asterisks.
    text = re.sub(r"(?<!\*)\*(?=\S)([^*]+?)(?<=\S)\*(?!\*)", r"\1", text)
    # Underscores additionally must not touch a word character on the outside,
    # so intraword underscores (snake_case_name) are not emphasis.
    text = re.sub(r"(?<!\w)_(?=\S)([^_]+?)(?<=\S)_(?!\w)", r"\1", text)

    # Remove headers # ## ###
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

    # Remove inline code `text`
    text = re.sub(r"`([^`]+?)`", r"\1", text)

    # Dashes with surrounding spaces are typically used as a parenthetical;
    # " – " becomes ". " (new sentence) for cleaner flow.
    text = text.replace(" \u2013 ", ". ")  # en dash with spaces
    text = text.replace(" \u2014 ", ". ")  # em dash with spaces

    # Remaining single-character substitutions in one pass:
    #   bare en/em dashes (rare)      -> hyphen
    #   typographic double quotes     -> "
    #   typographic single quotes     -> '
    plain_chars = str.maketrans(
        {
            "\u2013": "-",
            "\u2014": "-",
            "\u201e": '"',
            "\u201c": '"',
            "\u201d": '"',
            "\u201a": "'",
            "\u2018": "'",
            "\u2019": "'",
        }
    )
    return text.translate(plain_chars)
def save_version(
    order_id: int,
    content: str,
    version_number: int = 1,
    output_format: str = "markdown",
) -> int:
    """Save a content version to the database and return its row id.

    The content is stored in ``content_versions.content`` as a JSON object
    with the keys ``text`` and ``format``. When ``output_format`` is
    ``"reiner Text"``, markdown formatting is stripped from the content
    before it is stored.

    Args:
        order_id: Value for the ``order_id`` column.
        content: Text to store.
        version_number: Value for the ``version_number`` column.
        output_format: Format label stored alongside the text.

    Returns:
        The ``lastrowid`` of the cursor that executed the INSERT.
    """
    # Strip markdown for plain text format
    if output_format == "reiner Text":
        content = strip_markdown(content)

    content_json = json.dumps({"text": content, "format": output_format})

    cursor = db.execute(
        """INSERT INTO content_versions (order_id, version_number, content)
           VALUES (%s, %s, %s)""",
        (order_id, version_number, content_json),
    )
    try:
        db.commit()
        return cursor.lastrowid
    finally:
        # Release the cursor even when the commit raises.
        cursor.close()
# Number of leading characters of a source text used to look up its chunk.
_CHUNK_MATCH_PREFIX_LEN = 100


def _like_prefix_pattern(prefix: str) -> str:
    """Return a LIKE pattern (escape character ``!``) matching *prefix* literally."""
    # Escape the escape character first, then the two LIKE wildcards.
    escaped = prefix.replace("!", "!!").replace("%", "!%").replace("_", "!_")
    return escaped + "%"


def save_sources(order_id: int, context: list[dict]) -> None:
    """Save RAG sources to content_sources.

    For every context entry, the chunk whose ``content`` starts with the
    first ``_CHUNK_MATCH_PREFIX_LEN`` characters of ``ctx["content"]`` is
    looked up, and a ``content_sources`` row is written with
    ``INSERT IGNORE`` for the first match. Entries without a matching chunk,
    or with empty content, are skipped.

    Args:
        order_id: Value for the ``order_id`` column.
        context: Entries providing ``"content"`` (text) and ``"score"``
            (stored as ``relevance_score``).
    """
    for ctx in context:
        prefix = (ctx["content"] or "")[:_CHUNK_MATCH_PREFIX_LEN]
        if not prefix:
            # An empty prefix would produce the pattern "%", which matches
            # every chunk and would link an arbitrary one.
            continue

        # Try to find chunk_id by content match. The prefix is escaped so
        # that "%", "_" or backslashes in the text are compared literally
        # instead of acting as LIKE metacharacters.
        cursor = db.execute(
            "SELECT id FROM chunks WHERE content LIKE %s ESCAPE '!' LIMIT 1",
            (_like_prefix_pattern(prefix),),
        )
        try:
            chunk = cursor.fetchone()
        finally:
            cursor.close()

        if not chunk:
            continue

        cursor = db.execute(
            """INSERT IGNORE INTO content_sources (order_id, chunk_id, relevance_score)
               VALUES (%s, %s, %s)""",
            (order_id, chunk["id"], ctx["score"]),
        )
        try:
            db.commit()
        finally:
            # Release the cursor even when the commit raises.
            cursor.close()
def update_order_status(order_id: int, status: str) -> None:
    """Set the status of a content order and refresh its ``updated_at``.

    Args:
        order_id: ``id`` of the ``content_orders`` row to update.
        status: New value for the ``status`` column.
    """
    cursor = db.execute(
        "UPDATE content_orders SET status = %s, updated_at = NOW() WHERE id = %s",
        (status, order_id),
    )
    try:
        db.commit()
    finally:
        # Release the cursor even when the commit raises.
        cursor.close()