ollama = new OllamaService(); $this->pdo = $this->createConnection(); }

    /**
     * Performs a hybrid search combining semantic and structured filtering.
     *
     * Runs three stages: a vector search in Qdrant, enrichment and filtering
     * against the SQL chunk data, and a re-ranking pass. The vector stage is
     * asked for three times $limit hits (presumably so that the later filters
     * still leave enough candidates), and the final list is cut to $limit.
     *
     * @param string $query The search query
     * @param array{
     *     taxonomy_category?: string,
     *     taxonomy_path?: array,
     *     entity_type?: string,
     *     entity_name?: string,
     *     keyword?: string,
     *     min_score?: float
     * } $filters Optional structured filters
     * @param int $limit Maximum results; values below 1 yield an empty list
     * @return array<int, array{
     *     chunk_id: int,
     *     doc_id: int,
     *     path: string,
     *     title: string,
     *     content: string,
     *     heading_path: array,
     *     taxonomy: array,
     *     entities: array,
     *     keywords: array,
     *     score: float,
     *     relevance_score: float
     * }>
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // Nothing can be returned for a non-positive limit, so skip the
        // embedding call and the Qdrant round trip entirely.
        if ($limit < 1) {
            return [];
        }

        // Stage 1: Semantic search in Qdrant (over-fetch by a factor of three)
        $vectorResults = $this->semanticSearch($query, $filters, $limit * 3);

        if ($vectorResults === []) {
            return [];
        }

        // Stage 2: Enrich with SQL data and apply filters
        $enrichedResults = $this->enrichAndFilter($vectorResults, $filters);

        // Stage 3: Re-rank based on combined score
        $rankedResults = $this->rerank($enrichedResults, $query);

        return array_slice($rankedResults, 0, $limit);
    }

    /**
     * Searches within a specific taxonomy category.
     *
     * Shorthand for search() with only the `taxonomy_category` filter set.
     *
     * @return array<int, array<string, mixed>> Same row shape as search()
     */
    public function searchByCategory(string $query, string $category, int $limit = 10): array
    {
        return $this->search($query, ['taxonomy_category' => $category], $limit);
    }

    /**
     * Searches for chunks containing a specific entity.
     *
     * Shorthand for search() with only the `entity_name` filter set.
     *
     * @return array<int, array<string, mixed>> Same row shape as search()
     */
    public function searchByEntity(string $query, string $entityName, int $limit = 10): array
    {
        return $this->search($query, ['entity_name' => $entityName], $limit);
    }

    /**
     * Gets all available taxonomy categories.
     *
     * Counts the chunks per non-null `taxonomy_category`, most frequent first.
     *
     * @return array<int, array{category: string, count: int|string}>
     *     The count is returned as delivered by the PDO driver.
     */
    public function getTaxonomyCategories(): array
    {
        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all entities grouped by type.
*
     * Reads the `entities` JSON column of every chunk that has a non-empty
     * value and collects the distinct entity names per entity type, in
     * first-seen order. Entries that are not arrays, or whose `name` or `type`
     * is not a string, are skipped.
     *
     * @return array<string, list<string>> Entity names keyed by entity type
     */
    public function getEntitiesByType(): array
    {
        $stmt = $this->pdo->query("
            SELECT entities
            FROM dokumentation_chunks
            WHERE entities IS NOT NULL AND entities != '[]'
        ");

        $byType = [];
        // Hash set of [type][name] pairs already emitted; keeps de-duplication
        // O(1) per entity instead of scanning the growing list with in_array().
        $seen = [];

        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $row) {
            $entities = json_decode((string) $row['entities'], true);
            if (!is_array($entities)) {
                // Invalid JSON or a scalar document: nothing to collect.
                continue;
            }

            foreach ($entities as $entity) {
                if (!is_array($entity)) {
                    continue;
                }

                $name = $entity['name'] ?? null;
                $type = $entity['type'] ?? null;
                if (!is_string($name) || !is_string($type)) {
                    continue;
                }

                if (isset($seen[$type][$name])) {
                    continue;
                }

                $seen[$type][$name] = true;
                $byType[$type][] = $name;
            }
        }

        return $byType;
    }

    /**
     * Suggests related searches based on current results.
     *
     * Walks the results in order and, for each result, takes its keywords
     * first and its entity names second, skipping values that were already
     * collected. At most five suggestions are returned; collection stops as
     * soon as that many have been gathered. Non-string values are ignored.
     *
     * @param array<int, array<string, mixed>> $results Result rows (reads the `keywords` and `entities` keys)
     * @return list<string>
     */
    public function suggestRelatedSearches(array $results): array
    {
        $maxSuggestions = 5;
        $suggestions = [];

        foreach ($results as $result) {
            $candidates = [];

            // Keywords come first ...
            $keywords = $result['keywords'] ?? [];
            if (is_array($keywords)) {
                foreach ($keywords as $keyword) {
                    $candidates[] = $keyword;
                }
            }

            // ... followed by entity names.
            $entities = $result['entities'] ?? [];
            if (is_array($entities)) {
                foreach ($entities as $entity) {
                    if (is_array($entity) && isset($entity['name'])) {
                        $candidates[] = $entity['name'];
                    }
                }
            }

            foreach ($candidates as $candidate) {
                if (!is_string($candidate) || in_array($candidate, $suggestions, true)) {
                    continue;
                }

                $suggestions[] = $candidate;

                // Only the first five are ever returned, so stop scanning.
                if (count($suggestions) >= $maxSuggestions) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /**
     * Performs semantic search in Qdrant.
*
     * Embeds the query through the Ollama service and posts the vector to the
     * collection's `points/search` endpoint. When `taxonomy_category` is set
     * in $filters it is sent as a Qdrant `must` match filter; no other filter
     * key is read here.
     *
     * Returns an empty list when the response has no `result` array or when
     * the request throws a RuntimeException. Exceptions raised by the
     * embedding call itself are not caught here. Hits that are not arrays or
     * that lack a scalar `id` are skipped.
     *
     * @param array<string, mixed> $filters
     * @return list<array{id: string, score: float, payload: array<string, mixed>}>
     */
    private function semanticSearch(string $query, array $filters, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);

        $url = sprintf('%s/collections/%s/points/search', self::QDRANT_HOST, self::COLLECTION);

        $payload = [
            'vector' => array_values($embedding),
            'limit' => $limit,
            'with_payload' => true,
        ];

        // Add Qdrant filter if taxonomy category specified
        if (isset($filters['taxonomy_category'])) {
            $payload['filter'] = [
                'must' => [
                    [
                        'key' => 'taxonomy_category',
                        'match' => ['value' => $filters['taxonomy_category']],
                    ],
                ],
            ];
        }

        try {
            $response = $this->makeRequest($url, $payload, 'POST');
        } catch (RuntimeException) {
            // Best effort: a failed vector search yields no results.
            return [];
        }

        if (!isset($response['result']) || !is_array($response['result'])) {
            return [];
        }

        $hits = [];
        foreach ($response['result'] as $item) {
            // Skip malformed hits instead of failing with a TypeError or an
            // undefined-index warning.
            if (!is_array($item) || !isset($item['id']) || !is_scalar($item['id'])) {
                continue;
            }

            $score = $item['score'] ?? 0;
            $hitPayload = $item['payload'] ?? null;

            $hits[] = [
                'id' => (string) $item['id'],
                'score' => is_numeric($score) ? (float) $score : 0.0,
                'payload' => is_array($hitPayload) ? $hitPayload : [],
            ];
        }

        return $hits;
    }

    /**
     * Enriches vector results with SQL data and applies additional filters.
     *
     * For every vector hit whose score reaches `min_score` (default 0.3) and
     * whose payload carries a non-zero `chunk_id`, the chunk row is loaded
     * from the database. The hit is dropped when the chunk does not exist or
     * when it fails any of the active filters:
     *  - `entity_name`: case-insensitive substring match against entity names
     *  - `entity_type`: case-insensitive equality against entity types
     *  - `keyword`:     case-insensitive substring match against keywords
     *
     * NOTE(review): the `taxonomy_path` filter key is not evaluated here or in
     * semanticSearch(). One SQL query is issued per surviving hit.
     *
     * @param list<array{id: string, score: float, payload: array<string, mixed>}> $vectorResults
     * @param array<string, mixed> $filters
     * @return list<array{
     *     chunk_id: int,
     *     doc_id: int,
     *     path: string,
     *     title: string,
     *     content: string,
     *     heading_path: array,
     *     taxonomy: array,
     *     entities: array,
     *     keywords: array,
     *     score: float,
     *     relevance_score: float
     * }>
     */
    private function enrichAndFilter(array $vectorResults, array $filters): array
    {
        // Decodes a JSON column into an array; NULL, invalid JSON and scalar
        // documents all become an empty array.
        $decodeList = static function (mixed $json): array {
            if (!is_string($json) || $json === '') {
                return [];
            }

            $decoded = json_decode($json, true);

            return is_array($decoded) ? $decoded : [];
        };

        // True when any entity has a string $field for which $matches holds.
        $anyEntity = static function (array $entities, string $field, callable $matches): bool {
            foreach ($entities as $entity) {
                if (is_array($entity) && is_string($entity[$field] ?? null) && $matches($entity[$field])) {
                    return true;
                }
            }

            return false;
        };

        $minScore = $filters['min_score'] ?? 0.3;
        $nameFilter = isset($filters['entity_name']) ? (string) $filters['entity_name'] : null;
        $typeFilter = isset($filters['entity_type']) ? strtoupper((string) $filters['entity_type']) : null;
        $keywordFilter = isset($filters['keyword']) ? (string) $filters['keyword'] : null;

        $results = [];

        foreach ($vectorResults as $vr) {
            // Apply minimum score filter
            if ($vr['score'] < $minScore) {
                continue;
            }

            $chunkId = (int) ($vr['payload']['chunk_id'] ?? 0);
            if ($chunkId === 0) {
                continue;
            }

            // Get full chunk data from DB
            $chunk = $this->getChunkWithDocument($chunkId);
            if ($chunk === null) {
                continue;
            }

            // Decode each JSON column once and reuse it for filters and output.
            $entities = $decodeList($chunk['entities'] ?? null);
            $keywords = $decodeList($chunk['keywords'] ?? null);

            // Apply entity filter
            if ($nameFilter !== null) {
                $nameMatches = static fn (string $name): bool => stripos($name, $nameFilter) !== false;
                if (!$anyEntity($entities, 'name', $nameMatches)) {
                    continue;
                }
            }

            // Apply entity type filter
            if ($typeFilter !== null) {
                $typeMatches = static fn (string $type): bool => strtoupper($type) === $typeFilter;
                if (!$anyEntity($entities, 'type', $typeMatches)) {
                    continue;
                }
            }

            // Apply keyword filter
            if ($keywordFilter !== null) {
                $found = false;
                foreach ($keywords as $kw) {
                    if (is_string($kw) && stripos($kw, $keywordFilter) !== false) {
                        $found = true;
                        break;
                    }
                }
                if (!$found) {
                    continue;
                }
            }

            $results[] = [
                'chunk_id' => $chunkId,
                'doc_id' => (int) $chunk['dokumentation_id'],
                'path' => $chunk['doc_path'] ?? '',
                'title' => $chunk['doc_title'] ?? '',
                'content' => $chunk['content_clean'] ?? $chunk['content'] ?? '',
                'heading_path' => $decodeList($chunk['heading_path'] ?? null),
                'taxonomy' => $decodeList($chunk['taxonomy_path'] ?? null),
                'entities' => $entities,
                'keywords' => $keywords,
                'score' => $vr['score'],
                'relevance_score' => $vr['score'], // Will be adjusted in rerank
            ];
        }

        return $results;
    }

    /**
     * Re-ranks results based on combined semantic and structural relevance.
*
     * Each row's `relevance_score` becomes its vector `score` plus a boost,
     * capped at 1.0. The query is split on whitespace and every word is tested
     * as a case-insensitive substring:
     *  - +0.05 for each (keyword, word) pair that matches
     *  - +0.03 for each (entity name, word) pair that matches
     *  - +0.10 for each word found in the title
     * Non-string keywords and entity names are ignored. The rows are then
     * sorted by descending `relevance_score`.
     *
     * @param list<array<string, mixed>> $results Rows carrying `keywords`, `entities`, `title` and `score`
     * @return list<array<string, mixed>>
     */
    private function rerank(array $results, string $query): array
    {
        // PREG_SPLIT_NO_EMPTY drops only empty tokens; a bare array_filter()
        // would also discard the word "0". No lower-casing is needed because
        // stripos() is already case-insensitive.
        $queryWords = preg_split('/\s+/', $query, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Index-based loop instead of foreach-by-reference, so no dangling
        // reference to the last element survives the loop.
        foreach ($results as $index => $result) {
            $boost = 0.0;

            $keywords = is_array($result['keywords'] ?? null) ? $result['keywords'] : [];
            $entities = is_array($result['entities'] ?? null) ? $result['entities'] : [];
            $title = is_string($result['title'] ?? null) ? $result['title'] : '';

            // Boost for keyword matches
            foreach ($keywords as $keyword) {
                if (!is_string($keyword)) {
                    continue;
                }
                foreach ($queryWords as $word) {
                    if (stripos($keyword, $word) !== false) {
                        $boost += 0.05;
                    }
                }
            }

            // Boost for entity matches
            foreach ($entities as $entity) {
                $entityName = is_array($entity) ? ($entity['name'] ?? null) : null;
                if (!is_string($entityName)) {
                    continue;
                }
                foreach ($queryWords as $word) {
                    if (stripos($entityName, $word) !== false) {
                        $boost += 0.03;
                    }
                }
            }

            // Boost for title matches
            foreach ($queryWords as $word) {
                if (stripos($title, $word) !== false) {
                    $boost += 0.1;
                }
            }

            $results[$index]['relevance_score'] = min(1.0, (float) ($result['score'] ?? 0.0) + $boost);
        }

        // Sort by relevance score
        usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

        return $results;
    }

    /**
     * Gets chunk with document data.
     *
     * Loads one row of `dokumentation_chunks` joined with its parent
     * `dokumentation` row; the document's title and path are exposed as
     * `doc_title` and `doc_path`.
     *
     * @return array<string, mixed>|null The joined row, or null when no chunk has this id
     */
    private function getChunkWithDocument(int $chunkId): ?array
    {
        $stmt = $this->pdo->prepare('
            SELECT c.*, d.title as doc_title, d.path as doc_path
            FROM dokumentation_chunks c
            JOIN dokumentation d ON c.dokumentation_id = d.id
            WHERE c.id = :id
        ');
        $stmt->execute(['id' => $chunkId]);

        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Makes an HTTP request to Qdrant.
*
     * Sends $payload as a JSON body using the given HTTP method and returns
     * the decoded response. A response body that does not decode to an array
     * yields an empty array.
     *
     * @param array<string, mixed> $payload
     * @return array<string, mixed>
     * @throws RuntimeException When cURL cannot be initialised, the payload
     *     cannot be JSON-encoded, the transfer fails, or the HTTP status is 400 or above
     */
    private function makeRequest(string $url, array $payload, string $method): array
    {
        $handle = curl_init($url);
        if ($handle === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        $body = json_encode($payload);
        if ($body === false) {
            curl_close($handle);

            throw new RuntimeException('Failed to encode JSON payload');
        }

        $headers = [
            'Content-Type: application/json',
            'Content-Length: ' . strlen($body),
        ];

        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_CUSTOMREQUEST => $method,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
        ];
        curl_setopt_array($handle, $options);

        // Collect everything needed from the handle before releasing it.
        $raw = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        $transportError = curl_error($handle);
        curl_close($handle);

        if ($raw === false) {
            $reason = $transportError ?: 'Unknown error';

            throw new RuntimeException(sprintf('cURL request failed: %s', $reason));
        }

        if ($status >= 400) {
            throw new RuntimeException(sprintf('Qdrant API returned HTTP %d', $status));
        }

        $json = json_decode((string) $raw, true);
        if (!is_array($json)) {
            return [];
        }

        return $json;
    }

    /**
     * Opens the PDO connection to the local `ki_content` MySQL/MariaDB
     * database as user `root`, with exceptions as error mode and associative
     * arrays as the default fetch mode.
     */
    private function createConnection(): PDO
    {
        $dsn = 'mysql:host=localhost;dbname=ki_content;charset=utf8mb4';
        $driverOptions = [
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
        ];

        return new PDO($dsn, 'root', $this->getPassword(), $driverOptions);
    }

    /**
     * Reads the database password from the credentials markdown file.
     *
     * Takes the first line mentioning both "MariaDB" and "root" that splits
     * into at least four `|`-separated parts, and returns the trimmed fourth
     * part. Returns an empty string when the file cannot be read or no such
     * line exists.
     */
    private function getPassword(): string
    {
        $contents = file_get_contents('/var/www/docs/credentials/credentials.md');
        if ($contents === false) {
            return '';
        }

        foreach (explode("\n", $contents) as $row) {
            if (!str_contains($row, 'MariaDB') || !str_contains($row, 'root')) {
                continue;
            }

            $cells = explode('|', $row);
            if (count($cells) < 4) {
                continue;
            }

            return trim($cells[3]);
        }

        return '';
    }
}