Backup #329
| ID | 329 |
| Dateipfad | /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/HybridSearchService.php |
| Version | 3 |
| Typ | modified |
| Größe | 13.9 KB |
| Hash | b5de67c7a35c64f722dddaa8f1ee2f0a286b02391e54281b679aca6fed607158 |
| Datum | 2025-12-22 08:09:29 |
| Geändert von | claude-code-hook |
| Grund | Claude Code Pre-Hook Backup vor Edit-Operation |
| Datei existiert | Ja |
Dateiinhalt
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
use Infrastructure\AI\OllamaService;
use PDO;
use RuntimeException;
/**
* Hybrid Search Service combining vector search with SQL filtering.
*
* Implements a two-stage search:
* 1. Semantic search via Qdrant (vector similarity)
* 2. Structured filtering via MariaDB (taxonomy, entities, keywords)
*
* This approach achieves 70-85% precision vs 30-40% for pure vector search.
*/
final class HybridSearchService
{
    private const string QDRANT_HOST = 'http://localhost:6333';
    private const string COLLECTION = 'dokumentation_chunks';
    private const int TIMEOUT = 30;
    private const int CONNECT_TIMEOUT = 10;

    /** Vector hits scoring below this are dropped unless the caller passes `min_score`. */
    private const float DEFAULT_MIN_SCORE = 0.3;

    /** Stage 1 fetches this many times the requested limit so stage 2 filtering has headroom. */
    private const int OVERFETCH_FACTOR = 3;

    /** Additive boosts applied per (text, query word) match during re-ranking. */
    private const float KEYWORD_BOOST = 0.05;
    private const float ENTITY_BOOST = 0.03;
    private const float TITLE_BOOST = 0.1;

    private PDO $pdo;
    private OllamaService $ollama;

    /**
     * Both collaborators are optional so existing `new HybridSearchService()` call sites
     * keep working; passing them explicitly allows substituting a different connection
     * or embedding service.
     *
     * @param PDO|null           $pdo    Database connection; defaults to the dev connection factory.
     * @param OllamaService|null $ollama Embedding provider; defaults to a fresh OllamaService.
     */
    public function __construct(?PDO $pdo = null, ?OllamaService $ollama = null)
    {
        $this->ollama = $ollama ?? new OllamaService();
        $this->pdo = $pdo ?? $this->createConnection();
    }

    /**
     * Performs a hybrid search combining semantic and structured filtering.
     *
     * Pipeline: vector search in Qdrant (over-fetched), enrichment from the database
     * with structured filters, then re-ranking and truncation to $limit.
     *
     * NOTE: `taxonomy_path` is accepted in the filter shape but is not evaluated
     * anywhere in this class at present.
     *
     * @param string $query The search query
     * @param array{
     *     taxonomy_category?: string,
     *     taxonomy_path?: array<string>,
     *     entity_type?: string,
     *     entity_name?: string,
     *     keyword?: string,
     *     min_score?: float
     * } $filters Optional structured filters
     * @param int $limit Maximum results; values below 1 yield an empty result
     * @return array<array{
     *     chunk_id: int,
     *     doc_id: int,
     *     path: string,
     *     title: string,
     *     content: string,
     *     heading_path: array<string>,
     *     taxonomy: array<string>,
     *     entities: array<mixed>,
     *     keywords: array<string>,
     *     score: float,
     *     relevance_score: float
     * }>
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // A non-positive limit can never produce results; skip the embedding and HTTP calls.
        if ($limit < 1) {
            return [];
        }

        // Stage 1: Semantic search in Qdrant
        $vectorResults = $this->semanticSearch($query, $filters, $limit * self::OVERFETCH_FACTOR);
        if ($vectorResults === []) {
            return [];
        }

        // Stage 2: Enrich with SQL data and apply filters
        $enrichedResults = $this->enrichAndFilter($vectorResults, $filters);

        // Stage 3: Re-rank based on combined score
        return array_slice($this->rerank($enrichedResults, $query), 0, $limit);
    }

    /**
     * Searches within a specific taxonomy category.
     *
     * @return array<array<string, mixed>>
     */
    public function searchByCategory(string $query, string $category, int $limit = 10): array
    {
        return $this->search($query, ['taxonomy_category' => $category], $limit);
    }

    /**
     * Searches for chunks containing a specific entity.
     *
     * @return array<array<string, mixed>>
     */
    public function searchByEntity(string $query, string $entityName, int $limit = 10): array
    {
        return $this->search($query, ['entity_name' => $entityName], $limit);
    }

    /**
     * Gets all available taxonomy categories, most frequent first.
     *
     * @return array<array{category: string, count: int}>
     */
    public function getTaxonomyCategories(): array
    {
        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');
        if ($stmt === false) {
            return [];
        }

        // Normalise scalar types so the rows match the documented shape
        // regardless of how the driver returns numeric columns.
        return array_map(
            static fn (array $row): array => [
                'category' => (string) $row['category'],
                'count' => (int) $row['count'],
            ],
            $stmt->fetchAll(PDO::FETCH_ASSOC),
        );
    }

    /**
     * Gets all entities grouped by type, each name listed once in first-seen order.
     *
     * @return array<string, array<string>>
     */
    public function getEntitiesByType(): array
    {
        $stmt = $this->pdo->query("
            SELECT entities FROM dokumentation_chunks
            WHERE entities IS NOT NULL AND entities != '[]'
        ");
        if ($stmt === false) {
            return [];
        }

        $byType = [];
        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $row) {
            foreach ($this->decodeJsonArray($row['entities'] ?? null) as $entity) {
                if (!is_array($entity)) {
                    continue;
                }
                $name = $entity['name'] ?? null;
                $type = $entity['type'] ?? null;
                if (!is_string($name) || !is_string($type)) {
                    continue;
                }
                // Keyed by name for O(1) de-duplication; the value keeps the original
                // string because numeric-looking keys would be cast to int.
                $byType[$type][$name] = $name;
            }
        }

        return array_map(array_values(...), $byType);
    }

    /**
     * Suggests related searches based on current results.
     *
     * Collects keywords, then entity names, result by result, and returns the first
     * $limit distinct non-empty strings.
     *
     * @param array<array<string, mixed>> $results
     * @param int $limit Maximum number of suggestions
     * @return array<string>
     */
    public function suggestRelatedSearches(array $results, int $limit = 5): array
    {
        if ($limit < 1) {
            return [];
        }

        $suggestions = [];
        foreach ($results as $result) {
            if (!is_array($result)) {
                continue;
            }

            $candidates = [];
            $keywords = $result['keywords'] ?? [];
            foreach (is_array($keywords) ? $keywords : [] as $keyword) {
                $candidates[] = $keyword;
            }
            $entities = $result['entities'] ?? [];
            foreach (is_array($entities) ? $entities : [] as $entity) {
                if (is_array($entity)) {
                    $candidates[] = $entity['name'] ?? null;
                }
            }

            foreach ($candidates as $candidate) {
                if (!is_string($candidate) || $candidate === '' || in_array($candidate, $suggestions, true)) {
                    continue;
                }
                $suggestions[] = $candidate;
                if (count($suggestions) >= $limit) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /**
     * Performs semantic search in Qdrant.
     *
     * A RuntimeException raised by the HTTP call is treated as "no results"
     * (best-effort search). Malformed result items are skipped.
     *
     * @param array<string, mixed> $filters
     * @return array<array{id: string, score: float, payload: array<string, mixed>}>
     */
    private function semanticSearch(string $query, array $filters, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);
        $url = sprintf('%s/collections/%s/points/search', self::QDRANT_HOST, self::COLLECTION);

        $payload = [
            'vector' => array_values($embedding),
            'limit' => $limit,
            'with_payload' => true,
        ];

        // Add Qdrant filter if taxonomy category specified
        if (isset($filters['taxonomy_category'])) {
            $payload['filter'] = [
                'must' => [
                    [
                        'key' => 'taxonomy_category',
                        'match' => ['value' => $filters['taxonomy_category']],
                    ],
                ],
            ];
        }

        try {
            $response = $this->makeRequest($url, $payload, 'POST');
        } catch (RuntimeException) {
            return [];
        }

        if (!isset($response['result']) || !is_array($response['result'])) {
            return [];
        }

        $hits = [];
        foreach ($response['result'] as $item) {
            // Skip malformed items instead of letting a TypeError escape the search.
            if (!is_array($item) || !isset($item['id'])) {
                continue;
            }
            $hits[] = [
                'id' => (string) $item['id'],
                'score' => (float) ($item['score'] ?? 0),
                'payload' => is_array($item['payload'] ?? null) ? $item['payload'] : [],
            ];
        }

        return $hits;
    }

    /**
     * Enriches vector results with SQL data and applies additional filters.
     *
     * Hits are dropped when they score below `min_score`, carry no usable
     * `chunk_id` in their payload, have no database row, or fail a structured filter.
     *
     * @param array<array{id: string, score: float, payload: array<string, mixed>}> $vectorResults
     * @param array<string, mixed> $filters
     * @return array<array<string, mixed>>
     */
    private function enrichAndFilter(array $vectorResults, array $filters): array
    {
        $results = [];
        $minScore = (float) ($filters['min_score'] ?? self::DEFAULT_MIN_SCORE);

        foreach ($vectorResults as $vr) {
            // Apply minimum score filter
            if ($vr['score'] < $minScore) {
                continue;
            }

            $chunkId = (int) ($vr['payload']['chunk_id'] ?? 0);
            if ($chunkId === 0) {
                continue;
            }

            // Get full chunk data from DB
            $chunk = $this->getChunkWithDocument($chunkId);
            if ($chunk === null) {
                continue;
            }

            // Decode each JSON column once; filters and output share the result.
            $entities = $this->decodeJsonArray($chunk['entities'] ?? null);
            $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);

            if (!$this->passesFilters($entities, $keywords, $filters)) {
                continue;
            }

            $results[] = [
                'chunk_id' => $chunkId,
                'doc_id' => (int) $chunk['dokumentation_id'],
                'path' => $chunk['doc_path'] ?? '',
                'title' => $chunk['doc_title'] ?? '',
                'content' => $chunk['content_clean'] ?? $chunk['content'] ?? '',
                'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
                'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
                'entities' => $entities,
                'keywords' => $keywords,
                'score' => $vr['score'],
                'relevance_score' => $vr['score'], // Will be adjusted in rerank
            ];
        }

        return $results;
    }

    /**
     * Checks the entity-name, entity-type and keyword filters against one chunk.
     *
     * @param array<mixed> $entities Decoded `entities` column
     * @param array<mixed> $keywords Decoded `keywords` column
     * @param array<string, mixed> $filters
     */
    private function passesFilters(array $entities, array $keywords, array $filters): bool
    {
        if (isset($filters['entity_name'])) {
            $needle = (string) $filters['entity_name'];
            $matched = false;
            foreach ($entities as $entity) {
                if (
                    is_array($entity)
                    && is_string($entity['name'] ?? null)
                    && self::containsIgnoreCase($entity['name'], $needle)
                ) {
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                return false;
            }
        }

        if (isset($filters['entity_type'])) {
            $wanted = mb_strtoupper((string) $filters['entity_type'], 'UTF-8');
            $matched = false;
            foreach ($entities as $entity) {
                if (
                    is_array($entity)
                    && is_string($entity['type'] ?? null)
                    && mb_strtoupper($entity['type'], 'UTF-8') === $wanted
                ) {
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                return false;
            }
        }

        if (isset($filters['keyword'])) {
            $needle = (string) $filters['keyword'];
            $matched = false;
            foreach ($keywords as $keyword) {
                if (is_string($keyword) && self::containsIgnoreCase($keyword, $needle)) {
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                return false;
            }
        }

        return true;
    }

    /**
     * Re-ranks results based on combined semantic and structural relevance.
     *
     * Each query word found (case-insensitively) in a keyword, entity name or the
     * title adds a fixed boost to the vector score; the sum is capped at 1.0.
     *
     * @param array<array<string, mixed>> $results
     * @return array<array<string, mixed>>
     */
    private function rerank(array $results, string $query): array
    {
        // PREG_SPLIT_NO_EMPTY drops only empty tokens; a bare array_filter() would
        // also discard the legitimate query word "0".
        $queryWords = preg_split('/\s+/', $query, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Index-based writes avoid leaving a dangling by-reference loop variable
        // bound to the last element when the array is sorted below.
        foreach ($results as $index => $result) {
            $boost = 0.0;

            // Boost for keyword matches
            foreach ($result['keywords'] ?? [] as $keyword) {
                if (is_string($keyword)) {
                    $boost += self::KEYWORD_BOOST * self::countMatchingWords($keyword, $queryWords);
                }
            }

            // Boost for entity matches
            foreach ($result['entities'] ?? [] as $entity) {
                if (is_array($entity) && is_string($entity['name'] ?? null)) {
                    $boost += self::ENTITY_BOOST * self::countMatchingWords($entity['name'], $queryWords);
                }
            }

            // Boost for title matches
            $title = $result['title'] ?? '';
            if (is_string($title)) {
                $boost += self::TITLE_BOOST * self::countMatchingWords($title, $queryWords);
            }

            $results[$index]['relevance_score'] = min(1.0, (float) ($result['score'] ?? 0.0) + $boost);
        }

        // Sort by relevance score, highest first
        usort(
            $results,
            static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score'],
        );

        return $results;
    }

    /**
     * Counts how many of the given words occur in $text, ignoring case.
     *
     * @param array<string> $words
     */
    private static function countMatchingWords(string $text, array $words): int
    {
        $matches = 0;
        foreach ($words as $word) {
            if (self::containsIgnoreCase($text, $word)) {
                $matches++;
            }
        }

        return $matches;
    }

    /**
     * Case-insensitive substring test that also folds non-ASCII letters
     * (byte-wise stripos() only folds ASCII).
     */
    private static function containsIgnoreCase(string $haystack, string $needle): bool
    {
        return mb_stripos($haystack, $needle, 0, 'UTF-8') !== false;
    }

    /**
     * Decodes a JSON column into an array; anything that is not a JSON
     * array/object (null, empty string, invalid JSON, scalar) becomes [].
     *
     * @return array<mixed>
     */
    private function decodeJsonArray(mixed $json): array
    {
        if (!is_string($json) || $json === '') {
            return [];
        }

        $decoded = json_decode($json, true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Gets chunk with document data (title and path joined from `dokumentation`).
     *
     * @return array<string, mixed>|null Null when no row exists for the id
     */
    private function getChunkWithDocument(int $chunkId): ?array
    {
        $stmt = $this->pdo->prepare('
            SELECT c.*, d.title as doc_title, d.path as doc_path
            FROM dokumentation_chunks c
            JOIN dokumentation d ON c.dokumentation_id = d.id
            WHERE c.id = :id
        ');
        $stmt->execute(['id' => $chunkId]);
        $result = $stmt->fetch(PDO::FETCH_ASSOC);

        return $result !== false ? $result : null;
    }

    /**
     * Makes an HTTP request with a JSON body to Qdrant.
     *
     * @param array<string, mixed> $payload
     * @return array<string, mixed> Decoded response, or [] when the body is not a JSON array/object
     * @throws RuntimeException On cURL init/transport failure, JSON encoding failure, or HTTP status >= 400
     */
    private function makeRequest(string $url, array $payload, string $method): array
    {
        $ch = curl_init($url);
        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        // Checked manually (not JSON_THROW_ON_ERROR) so the failure stays a
        // RuntimeException, which is what semanticSearch() catches.
        $jsonPayload = json_encode($payload);
        if ($jsonPayload === false) {
            throw new RuntimeException('Failed to encode JSON payload');
        }

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => self::CONNECT_TIMEOUT,
            CURLOPT_CUSTOMREQUEST => $method,
            CURLOPT_POSTFIELDS => $jsonPayload,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/json',
                'Content-Length: ' . strlen($jsonPayload),
            ],
        ]);

        $result = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);
        // No curl_close(): it has no effect since PHP 8.0; the handle is
        // released when $ch goes out of scope.
        unset($ch);

        if ($result === false) {
            throw new RuntimeException(sprintf('cURL request failed: %s', $curlError ?: 'Unknown error'));
        }

        if ($httpCode >= 400) {
            throw new RuntimeException(sprintf('Qdrant API returned HTTP %d', $httpCode));
        }

        $decoded = json_decode((string) $result, true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Creates the default database connection via the project's dev factory.
     */
    private function createConnection(): PDO
    {
        return \Infrastructure\Config\DatabaseFactory::dev();
    }
}
Vollständig herunterladen
Aktionen
Andere Versionen dieser Datei
| ID | Version | Typ | Größe | Datum |
| 2016 | 33 | modified | 12.3 KB | 2025-12-28 14:24 |
| 2015 | 32 | modified | 13.8 KB | 2025-12-28 14:23 |
| 2014 | 31 | modified | 14.3 KB | 2025-12-28 14:23 |
| 2013 | 30 | modified | 14.4 KB | 2025-12-28 14:22 |
| 2006 | 29 | modified | 12.9 KB | 2025-12-28 14:18 |
| 2003 | 28 | modified | 12.4 KB | 2025-12-28 14:18 |
| 2002 | 27 | modified | 12.2 KB | 2025-12-28 14:18 |
| 2001 | 26 | modified | 11.5 KB | 2025-12-28 14:17 |
| 1974 | 25 | modified | 11.8 KB | 2025-12-28 02:33 |
| 1973 | 24 | modified | 11.9 KB | 2025-12-28 02:32 |
| 1972 | 23 | modified | 12.0 KB | 2025-12-28 02:32 |
| 1971 | 22 | modified | 12.1 KB | 2025-12-28 02:31 |
| 1970 | 21 | modified | 12.2 KB | 2025-12-28 02:31 |
| 1969 | 20 | modified | 12.4 KB | 2025-12-28 02:31 |
| 1968 | 19 | modified | 12.6 KB | 2025-12-28 02:31 |
| 1967 | 18 | modified | 12.7 KB | 2025-12-28 02:30 |
| 1966 | 17 | modified | 12.7 KB | 2025-12-28 02:30 |
| 1965 | 16 | modified | 12.9 KB | 2025-12-28 02:30 |
| 1964 | 15 | modified | 13.5 KB | 2025-12-28 02:30 |
| 1510 | 14 | modified | 13.5 KB | 2025-12-25 18:21 |
| 1509 | 13 | modified | 13.7 KB | 2025-12-25 18:21 |
| 1502 | 12 | modified | 13.6 KB | 2025-12-25 17:48 |
| 854 | 11 | modified | 13.7 KB | 2025-12-23 08:46 |
| 853 | 10 | modified | 13.8 KB | 2025-12-23 08:46 |
| 787 | 9 | modified | 14.0 KB | 2025-12-23 08:05 |
| 366 | 8 | modified | 14.0 KB | 2025-12-22 08:24 |
| 365 | 7 | modified | 13.9 KB | 2025-12-22 08:24 |
| 332 | 6 | modified | 13.9 KB | 2025-12-22 08:09 |
| 331 | 5 | modified | 13.9 KB | 2025-12-22 08:09 |
| 330 | 4 | modified | 13.9 KB | 2025-12-22 08:09 |
| 329 | 3 | modified | 13.9 KB | 2025-12-22 08:09 |
| 38 | 2 | modified | 14.7 KB | 2025-12-20 17:24 |
| 26 | 1 | modified | 14.7 KB | 2025-12-20 17:17 |
← Zurück zur Übersicht