{
"event": "PreToolUse",
"tool_name": "Bash",
"tool_input": {
"command": ".\/venv\/bin\/python -c \"\nfrom db import db\n\ndb.connect()\ncursor = db.execute('SELECT content FROM chunks WHERE id = 12')\nchunk = cursor.fetchone()\ncursor.close()\n\ncontent = chunk['content']\nprint(f'Char length: {len(content)}')\nprint(f'Byte length: {len(content.encode(\\\"utf-8\\\"))}')\nprint(f'Word count: {len(content.split())}')\n\n# Check for special chars\nimport unicodedata\nspecial = [c for c in content if not c.isascii() and not c.isspace()]\nprint(f'Non-ASCII chars: {len(special)}')\nprint(f'Sample: {special[:20]}')\n\n# Try with cleaned version\nimport re\ncleaned = re.sub(r'[^\\x00-\\x7F]+', ' ', content)\nprint(f'\\\\nCleaned length: {len(cleaned)}')\n\nimport requests\nresponse = requests.post(\n 'http:\/\/localhost:11434\/api\/embeddings',\n json={'model': 'mxbai-embed-large', 'prompt': cleaned},\n timeout=60\n)\nprint(f'Cleaned embed status: {response.status_code}')\n\ndb.disconnect()\n\"",
"description": "Check special characters"
}
}
{
"tool_response": {
"stdout": "Char length: 958\nByte length: 971\nWord count: 68\nNon-ASCII chars: 12\nSample: ['Ü', '–', '\\xad', '\\xad', '\\xad', 'ö', 'ä', 'ü', '\\xad', '\\xad', 'ä', 'ä']\n\nCleaned length: 958\nCleaned embed status: 500",
"stderr": "",
"interrupted": false,
"isImage": false
}
}