qdrantHost = CredentialService::getQdrantHost(); }

    /**
     * Runs the hybrid search pipeline: vector lookup, SQL enrichment with
     * filtering, and re-ranking.
     *
     * The vector stage is asked for three times $limit candidates
     * (presumably so that rows rejected by the filter stage still leave
     * enough results to fill the page).
     *
     * @param string $query   Free-text search query.
     * @param array  $filters Optional filter map handed to the vector and filter stages.
     * @param int    $limit   Maximum number of results to return.
     *
     * @return array Re-ranked result rows, at most $limit entries.
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // Stage 1: over-fetch candidates from the vector index.
        $candidates = $this->semanticSearch($query, $filters, $limit * 3);
        if (empty($candidates)) {
            return [];
        }

        // Stage 2: attach SQL data and drop rows that fail the filters.
        $enriched = $this->enrichAndFilter($candidates, $filters);

        // Stage 3: order by the combined relevance score and cut to size.
        $ranked = $this->rerank($enriched, $query);

        return array_slice($ranked, 0, $limit);
    }

    /**
     * Runs search() restricted to a single taxonomy category.
     */
    public function searchByCategory(string $query, string $category, int $limit = 10): array
    {
        $filters = ['taxonomy_category' => $category];

        return $this->search($query, $filters, $limit);
    }

    /**
     * Runs search() restricted to chunks that mention the given entity.
     */
    public function searchByEntity(string $query, string $entityName, int $limit = 10): array
    {
        $filters = ['entity_name' => $entityName];

        return $this->search($query, $filters, $limit);
    }

    /**
     * Runs search() restricted to chunks with the given intent
     * (explain, argue, define, etc.).
     */
    public function searchByIntent(string $query, string $intent, int $limit = 10): array
    {
        $filters = ['intent' => $intent];

        return $this->search($query, $filters, $limit);
    }

    /**
     * Runs search() restricted to chunks whose discourse role is "definition".
     */
    public function searchDefinitions(string $query, int $limit = 10): array
    {
        $filters = ['discourse_role' => 'definition'];

        return $this->search($query, $filters, $limit);
    }

    /**
     * Runs search() restricted to chunks whose discourse role is "evidence".
     */
    public function searchEvidence(string $query, int $limit = 10): array
    {
        $filters = ['discourse_role' => 'evidence'];

        return $this->search($query, $filters, $limit);
    }

    /** Gets all available taxonomy categories with counts. 
*/
    public function getTaxonomyCategories(): array
    {
        // One row per non-null category ("category", "count"), most frequent first.
        $stmt = $this->pdo->query(' SELECT taxonomy_category as category, COUNT(*) as count FROM dokumentation_chunks WHERE taxonomy_category IS NOT NULL GROUP BY taxonomy_category ORDER BY count DESC ');

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all entity names grouped by entity type.
     *
     * Reads the JSON "entities" column of every chunk that has one and
     * collects each distinct name once per type, in first-seen order.
     * Entries that are not arrays, or whose "name"/"type" are not strings,
     * are skipped so malformed JSON cannot raise an illegal-offset TypeError.
     *
     * @return array<array-key, list<string>> Map of entity type to distinct entity names.
     */
    public function getEntitiesByType(): array
    {
        $stmt = $this->pdo->query(" SELECT entities FROM dokumentation_chunks WHERE entities IS NOT NULL AND entities != '[]' ");

        $byType = [];
        // Membership index ($seen[type][name]) so de-duplication is O(1)
        // instead of an in_array() scan per entity.
        $seen = [];

        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $row) {
            foreach ($this->decodeJsonArray($row['entities'] ?? null) as $entity) {
                if (!is_array($entity)) {
                    continue;
                }

                $name = $entity['name'] ?? null;
                $type = $entity['type'] ?? null;
                if (!is_string($name) || !is_string($type)) {
                    continue;
                }

                if (isset($seen[$type][$name])) {
                    continue;
                }

                $seen[$type][$name] = true;
                $byType[$type][] = $name;
            }
        }

        return $byType;
    }

    /**
     * Suggests related searches based on current results.
     *
     * Walks the results in order and collects, per result, first its
     * keywords and then its entity names. Duplicates and non-string values
     * are ignored. Stops as soon as five suggestions have been gathered.
     *
     * @param array $results Result rows; the optional "keywords" and "entities" keys are read.
     *
     * @return list<string> Up to five distinct suggestions in first-seen order.
     */
    public function suggestRelatedSearches(array $results): array
    {
        $maxSuggestions = 5;
        $suggestions = [];
        $seen = [];

        foreach ($results as $result) {
            $keywords = $result['keywords'] ?? [];
            $entities = $result['entities'] ?? [];

            // Keywords come first, then entity names, matching the output order.
            $candidates = is_array($keywords) ? array_values($keywords) : [];
            if (is_array($entities)) {
                foreach ($entities as $entity) {
                    if (is_array($entity)) {
                        $candidates[] = $entity['name'] ?? null;
                    }
                }
            }

            foreach ($candidates as $candidate) {
                if (!is_string($candidate) || isset($seen[$candidate])) {
                    continue;
                }

                $seen[$candidate] = true;
                $suggestions[] = $candidate;

                // Only the first five are ever returned; no need to scan further.
                if (count($suggestions) === $maxSuggestions) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /** Performs semantic search in Qdrant. 
*/
    private function semanticSearch(string $query, array $filters, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);

        // Without a vector there is nothing to compare against; skip the HTTP call.
        if (empty($embedding)) {
            return [];
        }

        $url = sprintf('%s/collections/%s/points/search', $this->qdrantHost, self::COLLECTION);

        $payload = [
            'vector' => array_values($embedding),
            'limit' => $limit,
            'with_payload' => true,
        ];

        // Only the taxonomy category is filtered inside Qdrant; all other
        // filters are applied afterwards in matchesFilters().
        if (isset($filters['taxonomy_category'])) {
            $payload['filter'] = [
                'must' => [
                    [
                        'key' => 'taxonomy_category',
                        'match' => ['value' => $filters['taxonomy_category']],
                    ],
                ],
            ];
        }

        try {
            $response = $this->makeRequest($url, $payload, 'POST');
        } catch (RuntimeException $e) {
            // Best effort: a failed vector lookup still yields "no results",
            // but leave a trace so an outage is distinguishable from an empty hit list.
            error_log(sprintf('Qdrant semantic search failed: %s', $e->getMessage()));

            return [];
        }

        $points = $response['result'] ?? null;
        if (!is_array($points)) {
            return [];
        }

        $hits = [];
        foreach ($points as $item) {
            // Skip malformed entries instead of failing the whole search.
            if (!is_array($item)) {
                continue;
            }

            $hits[] = [
                'id' => (string) ($item['id'] ?? ''),
                'score' => (float) ($item['score'] ?? 0),
                'payload' => is_array($item['payload'] ?? null) ? $item['payload'] : [],
            ];
        }

        return $hits;
    }

    /**
     * Enriches vector results with SQL data and applies filters.
     *
     * A hit is dropped when its score is below the "min_score" filter
     * (default 0.3), when its payload carries no usable chunk_id, when the
     * chunk cannot be loaded, or when matchesFilters() rejects it.
     *
     * @param array $vectorResults Hits as produced by semanticSearch() ("score", "payload").
     * @param array $filters       Filter map; "min_score" is read here, the rest in matchesFilters().
     *
     * @return array List of result rows; "relevance_score" starts out equal to "score".
     */
    private function enrichAndFilter(array $vectorResults, array $filters): array
    {
        $minScore = $filters['min_score'] ?? 0.3;
        $results = [];

        foreach ($vectorResults as $hit) {
            if ($hit['score'] < $minScore) {
                continue;
            }

            $chunkId = (int) ($hit['payload']['chunk_id'] ?? 0);
            if ($chunkId === 0) {
                continue;
            }

            // NOTE(review): this issues one SQL lookup per surviving hit.
            $chunk = $this->getChunkWithDocument($chunkId);
            if ($chunk === null) {
                continue;
            }

            // Apply entity/keyword/semantic filters.
            if (!$this->matchesFilters($chunk, $filters)) {
                continue;
            }

            $results[] = [
                'chunk_id' => $chunkId,
                'doc_id' => (int) $chunk['dokumentation_id'],
                'path' => $chunk['doc_path'] ?? '',
                'title' => $chunk['doc_title'] ?? '',
                'content' => $chunk['content_clean'] ?? $chunk['content'] ?? '',
                'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
                'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
                'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),
                'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),
                // Semantic metadata
                'summary' => $chunk['summary'] ?? null,
                'sentiment' => $chunk['sentiment'] ?? 'neutral',
                'intent' => $chunk['intent'] ?? null,
                'discourse_role' => $chunk['discourse_role'] ?? null,
                'score' => $hit['score'],
                'relevance_score' => $hit['score'],
            ];
        }

        return $results;
    }

    /**
     * Checks if a chunk matches the entity, keyword and semantic filters.
     *
     * Every filter that is present must match. Name and keyword filters are
     * case-insensitive substring matches done with mb_stripos(), so
     * non-ASCII letters such as umlauts fold correctly. Entity and keyword
     * entries that are not strings never match.
     *
     * @param array $chunk   Chunk row; "entities", "keywords", "intent", "discourse_role", "sentiment" are read.
     * @param array $filters Optional keys: entity_name, entity_type, keyword, intent, discourse_role, sentiment.
     *
     * @return bool True when the chunk satisfies every filter that is set.
     */
    private function matchesFilters(array $chunk, array $filters): bool
    {
        $hasNameFilter = isset($filters['entity_name']);
        $hasTypeFilter = isset($filters['entity_type']);

        // Decode the entity list once, and only when an entity filter needs it.
        $entities = ($hasNameFilter || $hasTypeFilter)
            ? $this->decodeJsonArray($chunk['entities'] ?? null)
            : [];

        if ($hasNameFilter) {
            $wantedName = (string) $filters['entity_name'];
            $found = false;

            foreach ($entities as $entity) {
                $name = is_array($entity) ? ($entity['name'] ?? null) : null;
                if (is_string($name) && mb_stripos($name, $wantedName, 0, 'UTF-8') !== false) {
                    $found = true;
                    break;
                }
            }

            if (!$found) {
                return false;
            }
        }

        if ($hasTypeFilter) {
            $wantedType = strtoupper((string) $filters['entity_type']);
            $found = false;

            foreach ($entities as $entity) {
                $type = is_array($entity) ? ($entity['type'] ?? null) : null;
                if (is_string($type) && strtoupper($type) === $wantedType) {
                    $found = true;
                    break;
                }
            }

            if (!$found) {
                return false;
            }
        }

        if (isset($filters['keyword'])) {
            $wantedKeyword = (string) $filters['keyword'];
            $found = false;

            foreach ($this->decodeJsonArray($chunk['keywords'] ?? null) as $keyword) {
                if (is_string($keyword) && mb_stripos($keyword, $wantedKeyword, 0, 'UTF-8') !== false) {
                    $found = true;
                    break;
                }
            }

            if (!$found) {
                return false;
            }
        }

        // Semantic filters: exact, case-sensitive comparison against the raw column.
        // NOTE(review): enrichAndFilter() reports a missing sentiment as 'neutral',
        // but a sentiment=neutral filter excludes chunks with no stored sentiment.
        // Confirm this is intended.
        foreach (['intent', 'discourse_role', 'sentiment'] as $field) {
            if (isset($filters[$field]) && ($chunk[$field] ?? null) !== $filters[$field]) {
                return false;
            }
        }

        return true;
    }

    /** Re-ranks results based on combined semantic and structural relevance. 
*/
    private function rerank(array $results, string $query): array
    {
        // Lower-case and split with Unicode awareness so umlauts fold correctly.
        // PREG_SPLIT_NO_EMPTY drops empty tokens without discarding a literal "0".
        $queryWords = preg_split('/\s+/u', mb_strtolower($query, 'UTF-8'), -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // A word repeated in the query must not be counted (and boosted) twice.
        $queryWords = array_values(array_unique($queryWords));

        $isDefinitionQuery = $this->isDefinitionQuery($query);

        // Number of distinct query words contained (case-insensitively) in $text.
        $countHits = static function (mixed $text) use ($queryWords): int {
            if (!is_string($text) || $text === '') {
                return 0;
            }

            $hits = 0;
            foreach ($queryWords as $word) {
                if (mb_stripos($text, $word, 0, 'UTF-8') !== false) {
                    $hits++;
                }
            }

            return $hits;
        };

        // Iterate by key instead of by reference: no dangling reference is
        // left behind when the array is sorted afterwards.
        foreach ($results as $index => $result) {
            $boost = 0.0;

            // Keyword matching boost: 0.05 per (keyword, query word) hit.
            foreach ($result['keywords'] ?? [] as $keyword) {
                $boost += 0.05 * $countHits($keyword);
            }

            // Entity matching boost: 0.03 per (entity name, query word) hit.
            foreach ($result['entities'] ?? [] as $entity) {
                if (is_array($entity)) {
                    $boost += 0.03 * $countHits($entity['name'] ?? null);
                }
            }

            // Title matching boost: 0.1 per query word found in the title.
            $boost += 0.1 * $countHits($result['title'] ?? null);

            // Semantic boost based on discourse role; the first matching arm wins.
            $discourseRole = $result['discourse_role'] ?? null;
            $boost += match (true) {
                $isDefinitionQuery && $discourseRole === 'definition' => 0.15,
                $discourseRole === 'thesis' => 0.08,
                $discourseRole === 'evidence' => 0.05,
                default => 0.0,
            };

            // Intent boost for explaining/defining chunks.
            if (in_array($result['intent'] ?? null, ['explain', 'define'], true)) {
                $boost += 0.05;
            }

            // The combined score is capped at 1.0.
            $results[$index]['relevance_score'] = min(1.0, $result['score'] + $boost);
        }

        usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

        return $results;
    }

    /**
     * Detects if the query is asking for a definition.
     *
     * Matches German question openers ("was ist", "was sind", "was bedeutet",
     * "definition", "erklär…"/"erklar…", "beschreib…") at the start of the
     * query, ignoring leading whitespace and case.
     *
     * @param string $query Raw user query.
     *
     * @return bool True when the query starts with one of the definition openers.
     */
    private function isDefinitionQuery(string $query): bool
    {
        // The "u" modifier is required: without it the character class in the
        // "erklär" pattern is matched byte-wise and can never consume the
        // two-byte UTF-8 "ä". \x{00E4} is "ä", written as a code point so the
        // pattern does not depend on the encoding of this source file.
        $patterns = [
            '/^was\s+ist\b/iu',
            '/^was\s+sind\b/iu',
            '/^was\s+bedeutet\b/iu',
            '/^definition\b/iu',
            '/^erkl[\x{00E4}a]r/iu',
            '/^beschreib/iu',
        ];

        // Leading whitespace must not defeat the ^ anchors.
        $subject = ltrim($query);

        foreach ($patterns as $pattern) {
            // preg_match() returns false on invalid UTF-8; treat that as "no match".
            if (preg_match($pattern, $subject) === 1) {
                return true;
            }
        }

        return false;
    }

    /** Gets chunk with document data. 
*/
    private function getChunkWithDocument(int $chunkId): ?array
    {
        // Chunk row joined with the title and path of its parent document.
        $statement = $this->pdo->prepare(' SELECT c.*, d.title as doc_title, d.path as doc_path FROM dokumentation_chunks c JOIN dokumentation d ON c.dokumentation_id = d.id WHERE c.id = :id ');
        $statement->execute(['id' => $chunkId]);

        $row = $statement->fetch(PDO::FETCH_ASSOC);
        if ($row === false) {
            // No chunk with this id.
            return null;
        }

        return $row;
    }

    /**
     * Sends a JSON request to Qdrant and returns the decoded response.
     *
     * @param string $url     Full endpoint URL.
     * @param array  $payload Request body; encoded as JSON.
     * @param string $method  HTTP method, passed to cURL as the custom request verb.
     *
     * @return array Decoded JSON response, or an empty array when the body does not decode to an array.
     *
     * @throws RuntimeException When cURL cannot be initialised, the payload cannot be
     *                          JSON-encoded, the transfer fails, or the HTTP status is 400 or above.
     */
    private function makeRequest(string $url, array $payload, string $method): array
    {
        $handle = curl_init($url);
        if ($handle === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        $body = json_encode($payload);
        if ($body === false) {
            curl_close($handle);

            throw new RuntimeException('Failed to encode JSON payload');
        }

        $headers = [
            'Content-Type: application/json',
            'Content-Length: ' . strlen($body),
        ];

        curl_setopt_array($handle, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_CUSTOMREQUEST => $method,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
        ]);

        // Read status and error text before the handle is closed.
        $raw = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        $transportError = curl_error($handle);
        curl_close($handle);

        if ($raw === false) {
            $reason = $transportError ?: 'Unknown error';

            throw new RuntimeException(sprintf('cURL request failed: %s', $reason));
        }

        if ($status >= 400) {
            throw new RuntimeException(sprintf('Qdrant API returned HTTP %d', $status));
        }

        $data = json_decode((string) $raw, true);
        if (!is_array($data)) {
            return [];
        }

        return $data;
    }
}