{
"event": "PreToolUse",
"tool_name": "Edit",
"tool_input": {
"file_path": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/import_leipzig_corpus.py",
"old_string": " \" ON DUPLICATE KEY UPDATE frequency = VALUES(frequency)\"",
"new_string": " \" ON DUPLICATE KEY UPDATE frequency = frequency + VALUES(frequency)\"",
"replace_all": false
}
}
{
"tool_response": {
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/crawler\/scripts\/import_leipzig_corpus.py",
"oldString": " \" ON DUPLICATE KEY UPDATE frequency = VALUES(frequency)\"",
"newString": " \" ON DUPLICATE KEY UPDATE frequency = frequency + VALUES(frequency)\"",
"originalFile": "#!\/usr\/bin\/env python3\n\"\"\"Imports Leipzig Wortschatz word frequency list into reference_corpus_de.\n\nInput format: rank<TAB>word<TAB>frequency\nFilter: alphabetic tokens (äöüß+a-z) with 3..80 lowercase chars.\nBatch insert via executemany.\n\nUsage: python scripts\/import_leipzig_corpus.py <words_file> [corpus_name]\n\"\"\"\n\nfrom __future__ import annotations\n\nimport pathlib\nimport re\nimport sys\n\nsys.path.insert(0, \"\/var\/www\/scripts\/pipeline\")\nimport mysql.connector # noqa: E402\nfrom config import DB_CONFIG # noqa: E402\n\n_TOKEN_RE = re.compile(r\"^[a-zäöüß]{3,80}$\")\n_BATCH = 5000\n_INSERT = (\n \"INSERT INTO reference_corpus_de (corpus, token, frequency) VALUES (%s, %s, %s)\"\n \" ON DUPLICATE KEY UPDATE frequency = VALUES(frequency)\"\n)\n\n\ndef main() -> int:\n if len(sys.argv) < 2:\n print(\"usage: import_leipzig_corpus.py <words_file> [corpus_name]\", file=sys.stderr)\n return 2\n path = pathlib.Path(sys.argv[1])\n corpus = sys.argv[2] if len(sys.argv) > 2 else \"leipzig_news_2024_1M\"\n\n conn = mysql.connector.connect(\n host=DB_CONFIG[\"host\"], user=DB_CONFIG[\"user\"],\n password=DB_CONFIG[\"password\"], database=\"crawler\",\n )\n cur = conn.cursor()\n batch: list[tuple[str, str, int]] = []\n total = 0\n accepted = 0\n with path.open(encoding=\"utf-8\") as f:\n for line in f:\n total += 1\n parts = line.rstrip(\"\\n\").split(\"\\t\")\n if len(parts) != 3:\n continue\n word = parts[1].strip().lower()\n if not _TOKEN_RE.match(word):\n continue\n try:\n freq = int(parts[2])\n except ValueError:\n continue\n batch.append((corpus, word, freq))\n accepted += 1\n if len(batch) >= _BATCH:\n cur.executemany(_INSERT, batch)\n conn.commit()\n batch.clear()\n if batch:\n cur.executemany(_INSERT, batch)\n conn.commit()\n conn.close()\n print(f\"OK corpus={corpus} lines={total} accepted={accepted}\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n",
"structuredPatch": [
{
"oldStart": 22,
"oldLines": 7,
"newStart": 22,
"newLines": 7,
"lines": [
" _BATCH = 5000",
" _INSERT = (",
" \"INSERT INTO reference_corpus_de (corpus, token, frequency) VALUES (%s, %s, %s)\"",
"- \" ON DUPLICATE KEY UPDATE frequency = VALUES(frequency)\"",
"+ \" ON DUPLICATE KEY UPDATE frequency = frequency + VALUES(frequency)\"",
" )",
" ",
" "
]
}
],
"userModified": false,
"replaceAll": false
}
}