{
"tool_response": {
"type": "update",
"filePath": "\/var\/www\/dev.campus.systemische-tools.de\/src\/Infrastructure\/Docs\/ChunkSyncService.php",
"content": "<?php\n\ndeclare(strict_types=1);\n\nnamespace Infrastructure\\Docs;\n\n\/\/ @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant\n\nuse Infrastructure\\AI\\OllamaService;\nuse Infrastructure\\AI\\QdrantClient;\nuse Infrastructure\\Traits\\JsonDecodeTrait;\nuse PDO;\n\nfinal class ChunkSyncService\n{\n use JsonDecodeTrait;\n\n private const string COLLECTION = 'dokumentation_chunks';\n private const int VECTOR_SIZE = 1024;\n private const int BATCH_SIZE = 10;\n\n public function __construct(\n private PDO $pdo,\n private OllamaService $ollama,\n private QdrantClient $qdrant\n ) {\n }\n\n \/**\n * Ensures the Qdrant collection exists with proper configuration.\n *\/\n public function ensureCollection(): bool\n {\n return $this->qdrant->ensureCollection(self::COLLECTION, self::VECTOR_SIZE);\n }\n\n \/**\n * Syncs a single chunk to Qdrant.\n *\/\n public function syncChunk(int $chunkId): bool\n {\n $chunk = $this->getChunk($chunkId);\n\n if ($chunk === null) {\n return false;\n }\n\n if ($chunk['analysis_status'] !== 'completed') {\n return false;\n }\n\n $doc = $this->getDocument((int) $chunk['dokumentation_id']);\n $text = $this->prepareTextForEmbedding($chunk, $doc);\n $embedding = $this->ollama->getEmbedding($text);\n $payload = $this->buildPayload($chunk, $doc);\n $qdrantId = $chunk['qdrant_id'] ?? $this->qdrant->generateUuid();\n\n $success = $this->qdrant->upsertPoint(self::COLLECTION, $qdrantId, $embedding, $payload);\n\n if ($success && $chunk['qdrant_id'] === null) {\n $this->updateQdrantId($chunkId, $qdrantId);\n }\n\n return $success;\n }\n\n \/**\n * Syncs all analyzed chunks that haven't been synced yet.\n *\n * @return array{synced: int, failed: int, errors: array<string>}\n *\/\n public function syncAllPending(int $limit = 100): array\n {\n $this->ensureCollection();\n\n $results = ['synced' => 0, 'failed' => 0, 'errors' => []];\n $chunks = $this->getUnsyncedChunks($limit);\n\n foreach ($chunks as $chunk) {\n try {\n if ($this->syncChunk((int) $chunk['id'])) {\n $results['synced']++;\n\n if ($results['synced'] % self::BATCH_SIZE === 0) {\n echo \"Synced {$results['synced']} chunks...\\n\";\n }\n } else {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: Sync failed\";\n }\n } catch (\\RuntimeException $e) {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();\n }\n }\n\n return $results;\n }\n\n \/**\n * Syncs all chunks (re-sync).\n *\n * @return array{synced: int, failed: int, errors: array<string>}\n *\/\n public function syncAll(): array\n {\n $this->ensureCollection();\n\n $results = ['synced' => 0, 'failed' => 0, 'errors' => []];\n $chunks = $this->getAllAnalyzedChunks();\n\n foreach ($chunks as $chunk) {\n try {\n if ($this->syncChunk((int) $chunk['id'])) {\n $results['synced']++;\n\n if ($results['synced'] % self::BATCH_SIZE === 0) {\n echo \"Synced {$results['synced']} chunks...\\n\";\n }\n } else {\n $results['failed']++;\n }\n } catch (\\RuntimeException $e) {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();\n }\n }\n\n return $results;\n }\n\n \/**\n * Gets collection statistics.\n *\n * @return array{points_count: int, status: string}|null\n *\/\n public function getStats(): ?array\n {\n return $this->qdrant->getCollectionStats(self::COLLECTION);\n }\n\n \/**\n * Prepares text for embedding.\n *\n * @param array<string, mixed> $chunk\n * @param array<string, mixed> $doc\n *\/\n private function prepareTextForEmbedding(array $chunk, array $doc): string\n {\n $parts = [];\n\n $parts[] = 'Dokument: ' . ($doc['title'] ?? '');\n\n $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);\n if (!empty($headingPath)) {\n $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);\n }\n\n $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);\n if (!empty($taxonomy)) {\n $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);\n }\n\n $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);\n if (!empty($keywords)) {\n $parts[] = 'Keywords: ' . implode(', ', $keywords);\n }\n\n $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';\n $content = $this->sanitizeForEmbedding($content);\n if (mb_strlen($content) > 1000) {\n $content = mb_substr($content, 0, 1000) . '...';\n }\n $parts[] = 'Inhalt: ' . $content;\n\n $text = implode(\"\\n\\n\", $parts);\n\n if (mb_strlen($text) > 1800) {\n $text = mb_substr($text, 0, 1800) . '...';\n }\n\n return $text;\n }\n\n \/**\n * Sanitizes text for embedding by removing problematic characters.\n *\/\n private function sanitizeForEmbedding(string $text): string\n {\n $text = preg_replace('\/[\\x{2500}-\\x{257F}]\/u', ' ', $text) ?? $text;\n $text = preg_replace('\/[\\x{2580}-\\x{259F}]\/u', ' ', $text) ?? $text;\n $text = preg_replace('\/[\\x{25A0}-\\x{25FF}]\/u', ' ', $text) ?? $text;\n $text = preg_replace('\/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]\/u', '', $text) ?? $text;\n $text = preg_replace('\/[ \\t]+\/', ' ', $text) ?? $text;\n $text = preg_replace('\/\\n{3,}\/', \"\\n\\n\", $text) ?? $text;\n $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');\n\n return trim($text);\n }\n\n \/**\n * Builds the Qdrant payload.\n *\n * @param array<string, mixed> $chunk\n * @param array<string, mixed> $doc\n * @return array<string, mixed>\n *\/\n private function buildPayload(array $chunk, array $doc): array\n {\n $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';\n $content = $this->sanitizeForEmbedding($content);\n $preview = mb_strlen($content) > 300 ? mb_substr($content, 0, 300) . '...' : $content;\n\n return [\n 'chunk_id' => (int) $chunk['id'],\n 'doc_id' => (int) $chunk['dokumentation_id'],\n 'chunk_index' => (int) $chunk['chunk_index'],\n 'path' => $doc['path'] ?? '',\n 'title' => $doc['title'] ?? '',\n 'content_preview' => $preview,\n 'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),\n 'taxonomy_category' => $chunk['taxonomy_category'] ?? null,\n 'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),\n 'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),\n 'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),\n 'token_count' => (int) ($chunk['token_count'] ?? 0),\n ];\n }\n\n \/**\n * Updates the qdrant_id in the database.\n *\/\n private function updateQdrantId(int $chunkId, string $qdrantId): void\n {\n $stmt = $this->pdo->prepare('UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id');\n $stmt->execute(['id' => $chunkId, 'qid' => $qdrantId]);\n }\n\n \/**\n * Gets a chunk by ID.\n *\n * @return array<string, mixed>|null\n *\/\n private function getChunk(int $id): ?array\n {\n $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');\n $stmt->execute(['id' => $id]);\n $result = $stmt->fetch(PDO::FETCH_ASSOC);\n\n return $result !== false ? $result : null;\n }\n\n \/**\n * Gets a document by ID.\n *\n * @return array<string, mixed>\n *\/\n private function getDocument(int $id): array\n {\n $stmt = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');\n $stmt->execute(['id' => $id]);\n $result = $stmt->fetch(PDO::FETCH_ASSOC);\n\n return $result !== false ? $result : [];\n }\n\n \/**\n * Gets unsynced chunks (analyzed but not in Qdrant).\n *\n * @return array<array<string, mixed>>\n *\/\n private function getUnsyncedChunks(int $limit): array\n {\n $stmt = $this->pdo->prepare(\"\n SELECT * FROM dokumentation_chunks\n WHERE analysis_status = 'completed' AND qdrant_id IS NULL\n ORDER BY dokumentation_id, chunk_index\n LIMIT :limit\n \");\n $stmt->bindValue('limit', $limit, PDO::PARAM_INT);\n $stmt->execute();\n\n return $stmt->fetchAll(PDO::FETCH_ASSOC);\n }\n\n \/**\n * Gets all analyzed chunks.\n *\n * @return array<array<string, mixed>>\n *\/\n private function getAllAnalyzedChunks(): array\n {\n $stmt = $this->pdo->query(\"\n SELECT * FROM dokumentation_chunks\n WHERE analysis_status = 'completed'\n ORDER BY dokumentation_id, chunk_index\n \");\n\n return $stmt->fetchAll(PDO::FETCH_ASSOC);\n }\n}\n",
"structuredPatch": [
{
"oldStart": 7,
"oldLines": 10,
"newStart": 7,
"newLines": 9,
"lines": [
" \/\/ @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant",
" ",
" use Infrastructure\\AI\\OllamaService;",
"-use Infrastructure\\Config\\CredentialService;",
"+use Infrastructure\\AI\\QdrantClient;",
" use Infrastructure\\Traits\\JsonDecodeTrait;",
" use PDO;",
"-use RuntimeException;",
" ",
" final class ChunkSyncService",
" {"
]
},
{
"oldStart": 18,
"oldLines": 16,
"newStart": 17,
"newLines": 13,
"lines": [
" ",
" private const string COLLECTION = 'dokumentation_chunks';",
" private const int VECTOR_SIZE = 1024;",
"- private const int TIMEOUT = 60;",
" private const int BATCH_SIZE = 10;",
" ",
"- private string $qdrantHost;",
"-",
" public function __construct(",
" private PDO $pdo,",
"- private OllamaService $ollama",
"+ private OllamaService $ollama,",
"+ private QdrantClient $qdrant",
" ) {",
"- $this->qdrantHost = CredentialService::getQdrantHost();",
" }",
" ",
" \/**"
]
},
{
"oldStart": 35,
"oldLines": 33,
"newStart": 31,
"newLines": 7,
"lines": [
" *\/",
" public function ensureCollection(): bool",
" {",
"- \/\/ Check if collection exists",
"- $url = sprintf('%s\/collections\/%s', $this->qdrantHost, self::COLLECTION);",
"-",
"- try {",
"- $response = $this->makeRequest($url, [], 'GET');",
"- if (isset($response['result'])) {",
"- return true; \/\/ Collection exists",
"- }",
"- } catch (RuntimeException) {",
"- \/\/ Collection doesn't exist, create it",
"- }",
"-",
"- \/\/ Create collection",
"- $payload = [",
"- 'vectors' => [",
"- 'size' => self::VECTOR_SIZE,",
"- 'distance' => 'Cosine',",
"- ],",
"- ];",
"-",
"- try {",
"- $this->makeRequest($url, $payload, 'PUT');",
"-",
"- return true;",
"- } catch (RuntimeException $e) {",
"- throw new RuntimeException('Failed to create collection: ' . $e->getMessage());",
"- }",
"+ return $this->qdrant->ensureCollection(self::COLLECTION, self::VECTOR_SIZE);",
" }",
" ",
" \/**"
]
},
{
"oldStart": 75,
"oldLines": 29,
"newStart": 45,
"newLines": 18,
"lines": [
" return false;",
" }",
" ",
"- \/\/ Only sync completed analyses",
" if ($chunk['analysis_status'] !== 'completed') {",
" return false;",
" }",
" ",
"- \/\/ Get document context",
" $doc = $this->getDocument((int) $chunk['dokumentation_id']);",
"-",
"- \/\/ Prepare text for embedding",
" $text = $this->prepareTextForEmbedding($chunk, $doc);",
"-",
"- \/\/ Generate embedding",
" $embedding = $this->ollama->getEmbedding($text);",
"-",
"- \/\/ Build payload with metadata",
" $payload = $this->buildPayload($chunk, $doc);",
"+ $qdrantId = $chunk['qdrant_id'] ?? $this->qdrant->generateUuid();",
" ",
"- \/\/ Generate UUID for Qdrant if not exists",
"- $qdrantId = $chunk['qdrant_id'] ?? $this->generateUuid();",
"+ $success = $this->qdrant->upsertPoint(self::COLLECTION, $qdrantId, $embedding, $payload);",
" ",
"- \/\/ Upsert to Qdrant",
"- $success = $this->upsertPoint($qdrantId, $embedding, $payload);",
"-",
" if ($success && $chunk['qdrant_id'] === null) {",
" $this->updateQdrantId($chunkId, $qdrantId);",
" }"
]
},
{
"oldStart": 115,
"oldLines": 7,
"newStart": 74,
"newLines": 6,
"lines": [
" $this->ensureCollection();",
" ",
" $results = ['synced' => 0, 'failed' => 0, 'errors' => []];",
"-",
" $chunks = $this->getUnsyncedChunks($limit);",
" ",
" foreach ($chunks as $chunk) {"
]
},
{
"oldStart": 130,
"oldLines": 7,
"newStart": 88,
"newLines": 7,
"lines": [
" $results['failed']++;",
" $results['errors'][] = \"Chunk #{$chunk['id']}: Sync failed\";",
" }",
"- } catch (RuntimeException $e) {",
"+ } catch (\\RuntimeException $e) {",
" $results['failed']++;",
" $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();",
" }"
]
},
{
"oldStart": 149,
"oldLines": 7,
"newStart": 107,
"newLines": 6,
"lines": [
" $this->ensureCollection();",
" ",
" $results = ['synced' => 0, 'failed' => 0, 'errors' => []];",
"-",
" $chunks = $this->getAllAnalyzedChunks();",
" ",
" foreach ($chunks as $chunk) {"
]
},
{
"oldStart": 163,
"oldLines": 7,
"newStart": 120,
"newLines": 7,
"lines": [
" } else {",
" $results['failed']++;",
" }",
"- } catch (RuntimeException $e) {",
"+ } catch (\\RuntimeException $e) {",
" $results['failed']++;",
" $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();",
" }"
]
},
{
"oldStart": 173,
"oldLines": 91,
"newStart": 130,
"newLines": 13,
"lines": [
" }",
" ",
" \/**",
"- * Searches for similar chunks using semantic search.",
"- *",
"- * @param array<string, mixed>|null $filter Optional filter for taxonomy\/entities",
"- * @return array<array{id: int, doc_id: int, path: string, title: string, content: string, score: float, taxonomy: array<string>, entities: array<mixed>}>",
"- *\/",
"- public function search(string $query, int $limit = 5, ?array $filter = null): array",
"- {",
"- $embedding = $this->ollama->getEmbedding($query);",
"-",
"- $url = sprintf('%s\/collections\/%s\/points\/search', $this->qdrantHost, self::COLLECTION);",
"-",
"- $payload = [",
"- 'vector' => array_values($embedding),",
"- 'limit' => $limit,",
"- 'with_payload' => true,",
"- ];",
"-",
"- if ($filter !== null) {",
"- $payload['filter'] = $filter;",
"- }",
"-",
"- $response = $this->makeRequest($url, $payload, 'POST');",
"-",
"- if (!isset($response['result']) || !is_array($response['result'])) {",
"- return [];",
"- }",
"-",
"- return array_map(static function (array $item): array {",
"- $payload = $item['payload'] ?? [];",
"-",
"- return [",
"- 'id' => (int) ($payload['chunk_id'] ?? 0),",
"- 'doc_id' => (int) ($payload['doc_id'] ?? 0),",
"- 'path' => (string) ($payload['path'] ?? ''),",
"- 'title' => (string) ($payload['title'] ?? ''),",
"- 'content' => (string) ($payload['content_preview'] ?? ''),",
"- 'score' => (float) ($item['score'] ?? 0),",
"- 'taxonomy' => is_array($payload['taxonomy'] ?? null) ? $payload['taxonomy'] : [],",
"- 'entities' => is_array($payload['entities'] ?? null) ? $payload['entities'] : [],",
"- ];",
"- }, $response['result']);",
"- }",
"-",
"- \/**",
"- * Searches with taxonomy filter.",
"- *",
"- * @return array<array<string, mixed>>",
"- *\/",
"- public function searchByTaxonomy(string $query, string $category, int $limit = 5): array",
"- {",
"- $filter = [",
"- 'must' => [",
"- [",
"- 'key' => 'taxonomy_category',",
"- 'match' => ['value' => $category],",
"- ],",
"- ],",
"- ];",
"-",
"- return $this->search($query, $limit, $filter);",
"- }",
"-",
"- \/**",
" * Gets collection statistics.",
" *",
" * @return array{points_count: int, status: string}|null",
" *\/",
" public function getStats(): ?array",
" {",
"- $url = sprintf('%s\/collections\/%s', $this->qdrantHost, self::COLLECTION);",
"-",
"- try {",
"- $response = $this->makeRequest($url, [], 'GET');",
"-",
"- if (!isset($response['result'])) {",
"- return null;",
"- }",
"-",
"- return [",
"- 'points_count' => (int) ($response['result']['points_count'] ?? 0),",
"- 'status' => (string) ($response['result']['status'] ?? 'unknown'),",
"- ];",
"- } catch (RuntimeException) {",
"- return null;",
"- }",
"+ return $this->qdrant->getCollectionStats(self::COLLECTION);",
" }",
" ",
" \/**"
]
},
{
"oldStart": 270,
"oldLines": 28,
"newStart": 149,
"newLines": 23,
"lines": [
" {",
" $parts = [];",
" ",
"- \/\/ Document context",
" $parts[] = 'Dokument: ' . ($doc['title'] ?? '');",
" ",
"- \/\/ Heading path",
" $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);",
" if (!empty($headingPath)) {",
" $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);",
" }",
" ",
"- \/\/ Taxonomy",
" $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);",
" if (!empty($taxonomy)) {",
" $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);",
" }",
" ",
"- \/\/ Keywords",
" $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);",
" if (!empty($keywords)) {",
" $parts[] = 'Keywords: ' . implode(', ', $keywords);",
" }",
" ",
"- \/\/ Main content - sanitize and limit",
" $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';",
" $content = $this->sanitizeForEmbedding($content);",
" if (mb_strlen($content) > 1000) {"
]
},
{
"oldStart": 301,
"oldLines": 7,
"newStart": 175,
"newLines": 6,
"lines": [
" ",
" $text = implode(\"\\n\\n\", $parts);",
" ",
"- \/\/ Final safety limit for embedding model context",
" if (mb_strlen($text) > 1800) {",
" $text = mb_substr($text, 0, 1800) . '...';",
" }"
]
},
{
"oldStart": 314,
"oldLines": 19,
"newStart": 187,
"newLines": 12,
"lines": [
" *\/",
" private function sanitizeForEmbedding(string $text): string",
" {",
"- \/\/ Remove box-drawing and other problematic Unicode characters",
"- $text = preg_replace('\/[\\x{2500}-\\x{257F}]\/u', ' ', $text) ?? $text; \/\/ Box Drawing",
"- $text = preg_replace('\/[\\x{2580}-\\x{259F}]\/u', ' ', $text) ?? $text; \/\/ Block Elements",
"- $text = preg_replace('\/[\\x{25A0}-\\x{25FF}]\/u', ' ', $text) ?? $text; \/\/ Geometric Shapes",
"-",
"- \/\/ Remove control characters except newlines and tabs",
"+ $text = preg_replace('\/[\\x{2500}-\\x{257F}]\/u', ' ', $text) ?? $text;",
"+ $text = preg_replace('\/[\\x{2580}-\\x{259F}]\/u', ' ', $text) ?? $text;",
"+ $text = preg_replace('\/[\\x{25A0}-\\x{25FF}]\/u', ' ', $text) ?? $text;",
" $text = preg_replace('\/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]\/u', '', $text) ?? $text;",
"-",
"- \/\/ Normalize whitespace",
" $text = preg_replace('\/[ \\t]+\/', ' ', $text) ?? $text;",
" $text = preg_replace('\/\\n{3,}\/', \"\\n\\n\", $text) ?? $text;",
"-",
"- \/\/ Ensure valid UTF-8",
" $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');",
" ",
" return trim($text);"
]
},
{
"oldStart": 362,
"oldLines": 35,
"newStart": 228,
"newLines": 6,
"lines": [
" }",
" ",
" \/**",
"- * Upserts a point to Qdrant.",
"- *",
"- * @param array<int, float> $vector",
"- * @param array<string, mixed> $payload",
"- *\/",
"- private function upsertPoint(string $id, array $vector, array $payload): bool",
"- {",
"- $url = sprintf('%s\/collections\/%s\/points', $this->qdrantHost, self::COLLECTION);",
"-",
"- $data = [",
"- 'points' => [",
"- [",
"- 'id' => $id,",
"- 'vector' => array_values($vector),",
"- 'payload' => $payload,",
"- ],",
"- ],",
"- ];",
"-",
"- try {",
"- $this->makeRequest($url, $data, 'PUT');",
"-",
"- return true;",
"- } catch (RuntimeException) {",
"- return false;",
"- }",
"- }",
"-",
"- \/**",
" * Updates the qdrant_id in the database.",
" *\/",
" private function updateQdrantId(int $chunkId, string $qdrantId): void"
]
},
{
"oldStart": 461,
"oldLines": 80,
"newStart": 298,
"newLines": 4,
"lines": [
" ",
" return $stmt->fetchAll(PDO::FETCH_ASSOC);",
" }",
"-",
"- \/**",
"- * Generates a UUID v4.",
"- *\/",
"- private function generateUuid(): string",
"- {",
"- $data = random_bytes(16);",
"- $data[6] = chr((ord($data[6]) & 0x0f) | 0x40);",
"- $data[8] = chr((ord($data[8]) & 0x3f) | 0x80);",
"-",
"- return vsprintf('%s%s-%s-%s-%s-%s%s%s', str_split(bin2hex($data), 4));",
"- }",
"-",
"- \/**",
"- * Makes an HTTP request to Qdrant.",
"- *",
"- * @param array<string, mixed> $payload",
"- * @return array<string, mixed>",
"- *\/",
"- private function makeRequest(string $url, array $payload, string $method): array",
"- {",
"- $ch = curl_init($url);",
"-",
"- if ($ch === false) {",
"- throw new RuntimeException('Failed to initialize cURL');",
"- }",
"-",
"- $headers = ['Content-Type: application\/json'];",
"-",
"- if ($method === 'GET') {",
"- curl_setopt_array($ch, [",
"- CURLOPT_RETURNTRANSFER => true,",
"- CURLOPT_TIMEOUT => self::TIMEOUT,",
"- CURLOPT_CONNECTTIMEOUT => 10,",
"- CURLOPT_HTTPHEADER => $headers,",
"- CURLOPT_CUSTOMREQUEST => 'GET',",
"- ]);",
"- } else {",
"- $jsonPayload = json_encode($payload);",
"-",
"- if ($jsonPayload === false) {",
"- curl_close($ch);",
"-",
"- throw new RuntimeException('Failed to encode JSON payload');",
"- }",
"-",
"- $headers[] = 'Content-Length: ' . strlen($jsonPayload);",
"-",
"- curl_setopt_array($ch, [",
"- CURLOPT_RETURNTRANSFER => true,",
"- CURLOPT_TIMEOUT => self::TIMEOUT,",
"- CURLOPT_CONNECTTIMEOUT => 10,",
"- CURLOPT_HTTPHEADER => $headers,",
"- CURLOPT_CUSTOMREQUEST => $method,",
"- CURLOPT_POSTFIELDS => $jsonPayload,",
"- ]);",
"- }",
"-",
"- $result = curl_exec($ch);",
"- $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);",
"- $curlError = curl_error($ch);",
"-",
"- curl_close($ch);",
"-",
"- if ($result === false) {",
"- throw new RuntimeException(sprintf('cURL request failed: %s', $curlError ?: 'Unknown error'));",
"- }",
"-",
"- if ($httpCode >= 400) {",
"- throw new RuntimeException(sprintf('Qdrant API returned HTTP %d: %s', $httpCode, $result));",
"- }",
"-",
"- $decoded = json_decode((string) $result, true);",
"-",
"- return is_array($decoded) ? $decoded : [];",
"- }",
" }"
]
}
],
"originalFile": "<?php\n\ndeclare(strict_types=1);\n\nnamespace Infrastructure\\Docs;\n\n\/\/ @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant\n\nuse Infrastructure\\AI\\OllamaService;\nuse Infrastructure\\Config\\CredentialService;\nuse Infrastructure\\Traits\\JsonDecodeTrait;\nuse PDO;\nuse RuntimeException;\n\nfinal class ChunkSyncService\n{\n use JsonDecodeTrait;\n\n private const string COLLECTION = 'dokumentation_chunks';\n private const int VECTOR_SIZE = 1024;\n private const int TIMEOUT = 60;\n private const int BATCH_SIZE = 10;\n\n private string $qdrantHost;\n\n public function __construct(\n private PDO $pdo,\n private OllamaService $ollama\n ) {\n $this->qdrantHost = CredentialService::getQdrantHost();\n }\n\n \/**\n * Ensures the Qdrant collection exists with proper configuration.\n *\/\n public function ensureCollection(): bool\n {\n \/\/ Check if collection exists\n $url = sprintf('%s\/collections\/%s', $this->qdrantHost, self::COLLECTION);\n\n try {\n $response = $this->makeRequest($url, [], 'GET');\n if (isset($response['result'])) {\n return true; \/\/ Collection exists\n }\n } catch (RuntimeException) {\n \/\/ Collection doesn't exist, create it\n }\n\n \/\/ Create collection\n $payload = [\n 'vectors' => [\n 'size' => self::VECTOR_SIZE,\n 'distance' => 'Cosine',\n ],\n ];\n\n try {\n $this->makeRequest($url, $payload, 'PUT');\n\n return true;\n } catch (RuntimeException $e) {\n throw new RuntimeException('Failed to create collection: ' . $e->getMessage());\n }\n }\n\n \/**\n * Syncs a single chunk to Qdrant.\n *\/\n public function syncChunk(int $chunkId): bool\n {\n $chunk = $this->getChunk($chunkId);\n\n if ($chunk === null) {\n return false;\n }\n\n \/\/ Only sync completed analyses\n if ($chunk['analysis_status'] !== 'completed') {\n return false;\n }\n\n \/\/ Get document context\n $doc = $this->getDocument((int) $chunk['dokumentation_id']);\n\n \/\/ Prepare text for embedding\n $text = $this->prepareTextForEmbedding($chunk, $doc);\n\n \/\/ Generate embedding\n $embedding = $this->ollama->getEmbedding($text);\n\n \/\/ Build payload with metadata\n $payload = $this->buildPayload($chunk, $doc);\n\n \/\/ Generate UUID for Qdrant if not exists\n $qdrantId = $chunk['qdrant_id'] ?? $this->generateUuid();\n\n \/\/ Upsert to Qdrant\n $success = $this->upsertPoint($qdrantId, $embedding, $payload);\n\n if ($success && $chunk['qdrant_id'] === null) {\n $this->updateQdrantId($chunkId, $qdrantId);\n }\n\n return $success;\n }\n\n \/**\n * Syncs all analyzed chunks that haven't been synced yet.\n *\n * @return array{synced: int, failed: int, errors: array<string>}\n *\/\n public function syncAllPending(int $limit = 100): array\n {\n $this->ensureCollection();\n\n $results = ['synced' => 0, 'failed' => 0, 'errors' => []];\n\n $chunks = $this->getUnsyncedChunks($limit);\n\n foreach ($chunks as $chunk) {\n try {\n if ($this->syncChunk((int) $chunk['id'])) {\n $results['synced']++;\n\n if ($results['synced'] % self::BATCH_SIZE === 0) {\n echo \"Synced {$results['synced']} chunks...\\n\";\n }\n } else {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: Sync failed\";\n }\n } catch (RuntimeException $e) {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();\n }\n }\n\n return $results;\n }\n\n \/**\n * Syncs all chunks (re-sync).\n *\n * @return array{synced: int, failed: int, errors: array<string>}\n *\/\n public function syncAll(): array\n {\n $this->ensureCollection();\n\n $results = ['synced' => 0, 'failed' => 0, 'errors' => []];\n\n $chunks = $this->getAllAnalyzedChunks();\n\n foreach ($chunks as $chunk) {\n try {\n if ($this->syncChunk((int) $chunk['id'])) {\n $results['synced']++;\n\n if ($results['synced'] % self::BATCH_SIZE === 0) {\n echo \"Synced {$results['synced']} chunks...\\n\";\n }\n } else {\n $results['failed']++;\n }\n } catch (RuntimeException $e) {\n $results['failed']++;\n $results['errors'][] = \"Chunk #{$chunk['id']}: \" . $e->getMessage();\n }\n }\n\n return $results;\n }\n\n \/**\n * Searches for similar chunks using semantic search.\n *\n * @param array<string, mixed>|null $filter Optional filter for taxonomy\/entities\n * @return array<array{id: int, doc_id: int, path: string, title: string, content: string, score: float, taxonomy: array<string>, entities: array<mixed>}>\n *\/\n public function search(string $query, int $limit = 5, ?array $filter = null): array\n {\n $embedding = $this->ollama->getEmbedding($query);\n\n $url = sprintf('%s\/collections\/%s\/points\/search', $this->qdrantHost, self::COLLECTION);\n\n $payload = [\n 'vector' => array_values($embedding),\n 'limit' => $limit,\n 'with_payload' => true,\n ];\n\n if ($filter !== null) {\n $payload['filter'] = $filter;\n }\n\n $response = $this->makeRequest($url, $payload, 'POST');\n\n if (!isset($response['result']) || !is_array($response['result'])) {\n return [];\n }\n\n return array_map(static function (array $item): array {\n $payload = $item['payload'] ?? [];\n\n return [\n 'id' => (int) ($payload['chunk_id'] ?? 0),\n 'doc_id' => (int) ($payload['doc_id'] ?? 0),\n 'path' => (string) ($payload['path'] ?? ''),\n 'title' => (string) ($payload['title'] ?? ''),\n 'content' => (string) ($payload['content_preview'] ?? ''),\n 'score' => (float) ($item['score'] ?? 0),\n 'taxonomy' => is_array($payload['taxonomy'] ?? null) ? $payload['taxonomy'] : [],\n 'entities' => is_array($payload['entities'] ?? null) ? $payload['entities'] : [],\n ];\n }, $response['result']);\n }\n\n \/**\n * Searches with taxonomy filter.\n *\n * @return array<array<string, mixed>>\n *\/\n public function searchByTaxonomy(string $query, string $category, int $limit = 5): array\n {\n $filter = [\n 'must' => [\n [\n 'key' => 'taxonomy_category',\n 'match' => ['value' => $category],\n ],\n ],\n ];\n\n return $this->search($query, $limit, $filter);\n }\n\n \/**\n * Gets collection statistics.\n *\n * @return array{points_count: int, status: string}|null\n *\/\n public function getStats(): ?array\n {\n $url = sprintf('%s\/collections\/%s', $this->qdrantHost, self::COLLECTION);\n\n try {\n $response = $this->makeRequest($url, [], 'GET');\n\n if (!isset($response['result'])) {\n return null;\n }\n\n return [\n 'points_count' => (int) ($response['result']['points_count'] ?? 0),\n 'status' => (string) ($response['result']['status'] ?? 'unknown'),\n ];\n } catch (RuntimeException) {\n return null;\n }\n }\n\n \/**\n * Prepares text for embedding.\n *\n * @param array<string, mixed> $chunk\n * @param array<string, mixed> $doc\n *\/\n private function prepareTextForEmbedding(array $chunk, array $doc): string\n {\n $parts = [];\n\n \/\/ Document context\n $parts[] = 'Dokument: ' . ($doc['title'] ?? '');\n\n \/\/ Heading path\n $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);\n if (!empty($headingPath)) {\n $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);\n }\n\n \/\/ Taxonomy\n $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);\n if (!empty($taxonomy)) {\n $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);\n }\n\n \/\/ Keywords\n $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);\n if (!empty($keywords)) {\n $parts[] = 'Keywords: ' . implode(', ', $keywords);\n }\n\n \/\/ Main content - sanitize and limit\n $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';\n $content = $this->sanitizeForEmbedding($content);\n if (mb_strlen($content) > 1000) {\n $content = mb_substr($content, 0, 1000) . '...';\n }\n $parts[] = 'Inhalt: ' . $content;\n\n $text = implode(\"\\n\\n\", $parts);\n\n \/\/ Final safety limit for embedding model context\n if (mb_strlen($text) > 1800) {\n $text = mb_substr($text, 0, 1800) . '...';\n }\n\n return $text;\n }\n\n \/**\n * Sanitizes text for embedding by removing problematic characters.\n *\/\n private function sanitizeForEmbedding(string $text): string\n {\n \/\/ Remove box-drawing and other problematic Unicode characters\n $text = preg_replace('\/[\\x{2500}-\\x{257F}]\/u', ' ', $text) ?? $text; \/\/ Box Drawing\n $text = preg_replace('\/[\\x{2580}-\\x{259F}]\/u', ' ', $text) ?? $text; \/\/ Block Elements\n $text = preg_replace('\/[\\x{25A0}-\\x{25FF}]\/u', ' ', $text) ?? $text; \/\/ Geometric Shapes\n\n \/\/ Remove control characters except newlines and tabs\n $text = preg_replace('\/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]\/u', '', $text) ?? $text;\n\n ... [TRUNCATED-13e08f81e4baea49]"
}
}