Backup #26

ID: 26
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/HybridSearchService.php
Version: 1
Typ: modified
Größe: 14.7 KB
Hash: 900930936124ae27b9cbb1e7670f5c45b3e04f00a5c8ec69f16a476e0797a84a
Datum: 2025-12-20 17:17:51
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

use Infrastructure\AI\OllamaService;
use PDO;
use RuntimeException;

/**
 * Hybrid Search Service combining vector search with SQL filtering.
 *
 * Implements a staged search:
 * 1. Semantic search via Qdrant (vector similarity)
 * 2. Structured filtering via MariaDB (entities, keywords)
 * 3. Re-ranking by keyword, entity and title overlap with the query
 *
 * This approach achieves 70-85% precision vs 30-40% for pure vector search.
 *
 * Both collaborators may be injected through the constructor; when omitted,
 * the service builds its own OllamaService and MariaDB connection.
 */
final class HybridSearchService
{
    private const string QDRANT_HOST = 'http://localhost:6333';
    private const string COLLECTION = 'dokumentation_chunks';
    private const int TIMEOUT = 30;
    private const int CONNECT_TIMEOUT = 10;
    private const string CREDENTIALS_FILE = '/var/www/docs/credentials/credentials.md';

    /** Stage 1 fetches this many times the requested limit so later filters have headroom. */
    private const int OVERFETCH_FACTOR = 3;
    /** Vector score below which a hit is discarded unless `min_score` is given. */
    private const float DEFAULT_MIN_SCORE = 0.3;
    private const int MAX_SUGGESTIONS = 5;
    private const float KEYWORD_BOOST = 0.05;
    private const float ENTITY_BOOST = 0.03;
    private const float TITLE_BOOST = 0.1;

    private PDO $pdo;
    private OllamaService $ollama;

    /**
     * @param OllamaService|null $ollama Embedding provider; a default instance is created when null
     * @param PDO|null $pdo Database handle; a connection to the local MariaDB is opened when null
     */
    public function __construct(?OllamaService $ollama = null, ?PDO $pdo = null)
    {
        $this->ollama = $ollama ?? new OllamaService();
        $this->pdo = $pdo ?? $this->createConnection();
    }

    /**
     * Performs a hybrid search combining semantic and structured filtering.
     *
     * `taxonomy_category` is applied inside Qdrant; `entity_name`, `entity_type`,
     * `keyword` and `min_score` are applied afterwards against the SQL data.
     * NOTE: `taxonomy_path` is accepted for interface compatibility but is
     * currently not evaluated by any stage.
     *
     * Returns an empty list for a blank query or a non-positive limit without
     * contacting Ollama or Qdrant.
     *
     * @param string $query The search query
     * @param array{
     *     taxonomy_category?: string,
     *     taxonomy_path?: array<string>,
     *     entity_type?: string,
     *     entity_name?: string,
     *     keyword?: string,
     *     min_score?: float
     * } $filters Optional structured filters
     * @param int $limit Maximum results
     * @return array<array{
     *     chunk_id: int,
     *     doc_id: int,
     *     path: string,
     *     title: string,
     *     content: string,
     *     heading_path: array<string>,
     *     taxonomy: array<string>,
     *     entities: array<mixed>,
     *     keywords: array<string>,
     *     score: float,
     *     relevance_score: float
     * }>
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // Nothing sensible can be returned for these inputs; skip the remote calls.
        if ($limit < 1 || trim($query) === '') {
            return [];
        }

        // Stage 1: Semantic search in Qdrant (over-fetched, see OVERFETCH_FACTOR)
        $vectorResults = $this->semanticSearch($query, $filters, $limit * self::OVERFETCH_FACTOR);

        if ($vectorResults === []) {
            return [];
        }

        // Stage 2: Enrich with SQL data and apply filters
        $enrichedResults = $this->enrichAndFilter($vectorResults, $filters);

        // Stage 3: Re-rank based on combined score
        $rankedResults = $this->rerank($enrichedResults, $query);

        return array_slice($rankedResults, 0, $limit);
    }

    /**
     * Searches within a specific taxonomy category.
     *
     * @return array<array<string, mixed>>
     */
    public function searchByCategory(string $query, string $category, int $limit = 10): array
    {
        return $this->search($query, ['taxonomy_category' => $category], $limit);
    }

    /**
     * Searches for chunks containing a specific entity.
     *
     * @return array<array<string, mixed>>
     */
    public function searchByEntity(string $query, string $entityName, int $limit = 10): array
    {
        return $this->search($query, ['entity_name' => $entityName], $limit);
    }

    /**
     * Gets all available taxonomy categories, most frequent first.
     *
     * @return array<array{category: string, count: int}>
     */
    public function getTaxonomyCategories(): array
    {
        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');

        // Only reachable when an injected PDO does not use ERRMODE_EXCEPTION.
        if ($stmt === false) {
            return [];
        }

        // Normalise to the documented shape regardless of how the driver types COUNT(*).
        return array_map(
            static fn (array $row): array => [
                'category' => (string) $row['category'],
                'count' => (int) $row['count'],
            ],
            $stmt->fetchAll(PDO::FETCH_ASSOC),
        );
    }

    /**
     * Gets all entities grouped by type.
     *
     * Each name appears once per type, in first-seen order. Entities whose
     * `name` or `type` is missing or not a string are skipped.
     *
     * @return array<string, array<string>>
     */
    public function getEntitiesByType(): array
    {
        $stmt = $this->pdo->query("
            SELECT entities FROM dokumentation_chunks
            WHERE entities IS NOT NULL AND entities != '[]'
        ");

        if ($stmt === false) {
            return [];
        }

        $byType = [];
        // type => name => true; constant-time duplicate detection.
        $seen = [];

        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $row) {
            foreach (self::decodeList($row['entities'] ?? null) as $entity) {
                if (!is_array($entity)) {
                    continue;
                }

                $name = $entity['name'] ?? null;
                $type = $entity['type'] ?? null;
                if (!is_string($name) || !is_string($type) || isset($seen[$type][$name])) {
                    continue;
                }

                $seen[$type][$name] = true;
                $byType[$type][] = $name;
            }
        }

        return $byType;
    }

    /**
     * Suggests related searches based on current results.
     *
     * Walks the results in order and collects, per result, first its keywords
     * and then its entity names. Duplicates and non-string values are skipped;
     * collection stops as soon as MAX_SUGGESTIONS entries are found.
     *
     * @param array<array<string, mixed>> $results
     * @return array<string>
     */
    public function suggestRelatedSearches(array $results): array
    {
        $suggestions = [];
        $seen = [];

        foreach ($results as $result) {
            $candidates = [];

            $keywords = $result['keywords'] ?? null;
            foreach (is_array($keywords) ? $keywords : [] as $keyword) {
                $candidates[] = $keyword;
            }

            $entities = $result['entities'] ?? null;
            foreach (is_array($entities) ? $entities : [] as $entity) {
                $candidates[] = is_array($entity) ? ($entity['name'] ?? null) : null;
            }

            foreach ($candidates as $candidate) {
                if (!is_string($candidate) || isset($seen[$candidate])) {
                    continue;
                }

                $seen[$candidate] = true;
                $suggestions[] = $candidate;

                if (count($suggestions) >= self::MAX_SUGGESTIONS) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /**
     * Performs semantic search in Qdrant.
     *
     * Exceptions raised while computing the embedding propagate to the caller.
     * A failing Qdrant request is logged and reported as "no results".
     *
     * @param array<string, mixed> $filters
     * @return array<array{id: string, score: float, payload: array<string, mixed>}>
     */
    private function semanticSearch(string $query, array $filters, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);

        $url = sprintf('%s/collections/%s/points/search', self::QDRANT_HOST, self::COLLECTION);

        $payload = [
            'vector' => array_values($embedding),
            'limit' => $limit,
            'with_payload' => true,
        ];

        // Add Qdrant filter if taxonomy category specified
        if (isset($filters['taxonomy_category'])) {
            $payload['filter'] = [
                'must' => [
                    [
                        'key' => 'taxonomy_category',
                        'match' => ['value' => $filters['taxonomy_category']],
                    ],
                ],
            ];
        }

        try {
            $response = $this->makeRequest($url, $payload, 'POST');
        } catch (RuntimeException $e) {
            // Best effort: search degrades to an empty result, but the outage stays visible in the log.
            error_log(sprintf('HybridSearchService: Qdrant search failed: %s', $e->getMessage()));

            return [];
        }

        return self::normalizeHits($response['result'] ?? null);
    }

    /**
     * Converts the raw `result` list of a Qdrant search response into typed hits.
     *
     * Entries that are not arrays or that lack a string/integer `id` are
     * dropped; a missing or non-numeric score becomes 0.0 and a missing or
     * non-array payload becomes an empty array.
     *
     * @return array<array{id: string, score: float, payload: array<string, mixed>}>
     */
    private static function normalizeHits(mixed $points): array
    {
        if (!is_array($points)) {
            return [];
        }

        $hits = [];

        foreach ($points as $point) {
            if (!is_array($point)) {
                continue;
            }

            $id = $point['id'] ?? null;
            if (!is_string($id) && !is_int($id)) {
                continue;
            }

            $score = $point['score'] ?? null;
            $payload = $point['payload'] ?? null;

            $hits[] = [
                'id' => (string) $id,
                'score' => is_numeric($score) ? (float) $score : 0.0,
                'payload' => is_array($payload) ? $payload : [],
            ];
        }

        return $hits;
    }

    /**
     * Enriches vector results with SQL data and applies additional filters.
     *
     * Hits below the minimum score or without a `chunk_id` payload are dropped
     * first; the remaining chunks are loaded in a single query. A hit is kept
     * only when it satisfies every given filter:
     * - `entity_name`: case-insensitive substring of some entity name
     * - `entity_type`: case-insensitive equality with some entity type
     * - `keyword`: case-insensitive substring of some keyword
     *
     * @param array<array{id: string, score: float, payload: array<string, mixed>}> $vectorResults
     * @param array<string, mixed> $filters
     * @return array<array<string, mixed>>
     */
    private function enrichAndFilter(array $vectorResults, array $filters): array
    {
        $minScore = (float) ($filters['min_score'] ?? self::DEFAULT_MIN_SCORE);

        $candidates = [];
        foreach ($vectorResults as $vr) {
            if ($vr['score'] < $minScore) {
                continue;
            }

            $chunkId = (int) ($vr['payload']['chunk_id'] ?? 0);
            if ($chunkId === 0) {
                continue;
            }

            $candidates[] = ['chunk_id' => $chunkId, 'score' => $vr['score']];
        }

        if ($candidates === []) {
            return [];
        }

        // One round trip for all candidates instead of one query per hit.
        $chunks = $this->getChunksWithDocuments(array_column($candidates, 'chunk_id'));

        $entityName = self::stringFilter($filters, 'entity_name');
        $entityType = self::stringFilter($filters, 'entity_type');
        $keyword = self::stringFilter($filters, 'keyword');

        $results = [];

        foreach ($candidates as $candidate) {
            $chunk = $chunks[$candidate['chunk_id']] ?? null;
            if ($chunk === null) {
                continue;
            }

            // Decode each JSON column once; the filters and the result row share the values.
            $entities = self::decodeList($chunk['entities'] ?? null);
            $keywords = self::decodeList($chunk['keywords'] ?? null);

            if ($entityName !== null && !self::hasEntityNamed($entities, $entityName)) {
                continue;
            }

            if ($entityType !== null && !self::hasEntityOfType($entities, $entityType)) {
                continue;
            }

            if ($keyword !== null && !self::hasKeywordLike($keywords, $keyword)) {
                continue;
            }

            $results[] = [
                'chunk_id' => $candidate['chunk_id'],
                'doc_id' => (int) $chunk['dokumentation_id'],
                'path' => $chunk['doc_path'] ?? '',
                'title' => $chunk['doc_title'] ?? '',
                'content' => $chunk['content_clean'] ?? $chunk['content'] ?? '',
                'heading_path' => self::decodeList($chunk['heading_path'] ?? null),
                'taxonomy' => self::decodeList($chunk['taxonomy_path'] ?? null),
                'entities' => $entities,
                'keywords' => $keywords,
                'score' => $candidate['score'],
                'relevance_score' => $candidate['score'], // Will be adjusted in rerank
            ];
        }

        return $results;
    }

    /**
     * Re-ranks results based on combined semantic and structural relevance.
     *
     * For every whitespace-separated query word the vector score is boosted by
     * KEYWORD_BOOST per keyword containing it, ENTITY_BOOST per entity name
     * containing it and TITLE_BOOST when the title contains it. The sum is
     * capped at 1.0 and the list is sorted by it in descending order.
     *
     * @param array<array<string, mixed>> $results
     * @return array<array<string, mixed>>
     */
    private function rerank(array $results, string $query): array
    {
        // PREG_SPLIT_NO_EMPTY removes blanks without discarding the word "0".
        $queryWords = preg_split('/\s+/', $query, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Write back by index rather than by reference so no dangling reference survives the loop.
        foreach ($results as $index => $result) {
            $boost = 0.0;

            // Boost for keyword matches
            $keywords = $result['keywords'] ?? null;
            foreach (is_array($keywords) ? $keywords : [] as $keyword) {
                if (!is_string($keyword)) {
                    continue;
                }
                foreach ($queryWords as $word) {
                    if (self::containsIgnoreCase($keyword, $word)) {
                        $boost += self::KEYWORD_BOOST;
                    }
                }
            }

            // Boost for entity matches
            $entities = $result['entities'] ?? null;
            foreach (is_array($entities) ? $entities : [] as $entity) {
                $name = is_array($entity) ? ($entity['name'] ?? null) : null;
                if (!is_string($name)) {
                    continue;
                }
                foreach ($queryWords as $word) {
                    if (self::containsIgnoreCase($name, $word)) {
                        $boost += self::ENTITY_BOOST;
                    }
                }
            }

            // Boost for title matches
            $title = $result['title'] ?? null;
            if (is_string($title)) {
                foreach ($queryWords as $word) {
                    if (self::containsIgnoreCase($title, $word)) {
                        $boost += self::TITLE_BOOST;
                    }
                }
            }

            $results[$index]['relevance_score'] = min(1.0, (float) ($result['score'] ?? 0.0) + $boost);
        }

        // Sort by relevance score
        usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

        return $results;
    }

    /**
     * Loads chunks together with title and path of their document.
     *
     * @param array<int> $chunkIds
     * @return array<int, array<string, mixed>> Rows keyed by chunk id; unknown ids are absent
     */
    private function getChunksWithDocuments(array $chunkIds): array
    {
        $ids = array_values(array_unique($chunkIds));
        if ($ids === []) {
            return [];
        }

        // Only "?" markers are interpolated; the ids themselves are bound as parameters.
        $placeholders = implode(',', array_fill(0, count($ids), '?'));

        $stmt = $this->pdo->prepare("
            SELECT c.*, d.title as doc_title, d.path as doc_path
            FROM dokumentation_chunks c
            JOIN dokumentation d ON c.dokumentation_id = d.id
            WHERE c.id IN ({$placeholders})
        ");
        $stmt->execute($ids);

        $byId = [];
        foreach ($stmt->fetchAll(PDO::FETCH_ASSOC) as $row) {
            // Keep the first row per id, matching a single fetch() per chunk.
            $byId[(int) $row['id']] ??= $row;
        }

        return $byId;
    }

    /**
     * Makes an HTTP request to Qdrant.
     *
     * @param array<string, mixed> $payload
     * @return array<string, mixed>
     * @throws RuntimeException When the payload cannot be encoded, the transfer fails,
     *                          Qdrant answers with HTTP >= 400 or the body is not a JSON array/object
     */
    private function makeRequest(string $url, array $payload, string $method): array
    {
        // Encode before opening the handle so a failure leaves nothing to clean up.
        $jsonPayload = json_encode($payload);

        if ($jsonPayload === false) {
            throw new RuntimeException('Failed to encode JSON payload');
        }

        $ch = curl_init($url);

        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => self::CONNECT_TIMEOUT,
            CURLOPT_CUSTOMREQUEST => $method,
            CURLOPT_POSTFIELDS => $jsonPayload,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/json',
                'Content-Length: ' . strlen($jsonPayload),
            ],
        ]);

        $result = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $curlError = curl_error($ch);

        // No curl_close(): it has no effect since PHP 8.0; the handle is freed when $ch goes out of scope.

        if ($result === false) {
            throw new RuntimeException(sprintf('cURL request failed: %s', $curlError ?: 'Unknown error'));
        }

        if ($httpCode >= 400) {
            throw new RuntimeException(sprintf('Qdrant API returned HTTP %d', $httpCode));
        }

        $decoded = json_decode((string) $result, true);

        if (!is_array($decoded)) {
            throw new RuntimeException('Qdrant API returned an invalid JSON response');
        }

        return $decoded;
    }

    /**
     * Opens the MariaDB connection used for chunk and document lookups.
     */
    private function createConnection(): PDO
    {
        $password = $this->getPassword();

        return new PDO(
            'mysql:host=localhost;dbname=ki_content;charset=utf8mb4',
            'root',
            $password,
            [
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
                // Use server-side prepared statements instead of client-side emulation.
                PDO::ATTR_EMULATE_PREPARES => false,
            ],
        );
    }

    /**
     * Reads the MariaDB root password from the credentials file.
     *
     * Takes the first line mentioning both "MariaDB" and "root", splits it on
     * "|" and returns the trimmed fourth field. Returns an empty string when
     * the file is unreadable or no such line exists.
     */
    private function getPassword(): string
    {
        // Checked up front so a missing file does not raise a PHP warning.
        if (!is_readable(self::CREDENTIALS_FILE)) {
            return '';
        }

        $content = file_get_contents(self::CREDENTIALS_FILE);

        if ($content === false) {
            return '';
        }

        foreach (explode("\n", $content) as $line) {
            if (!str_contains($line, 'MariaDB') || !str_contains($line, 'root')) {
                continue;
            }

            $parts = explode('|', $line);
            if (count($parts) >= 4) {
                return trim($parts[3]);
            }
        }

        return '';
    }

    /**
     * Decodes a JSON column into an array; anything that is not a JSON array/object yields [].
     *
     * @return array<mixed>
     */
    private static function decodeList(mixed $json): array
    {
        if (!is_string($json) || $json === '') {
            return [];
        }

        $decoded = json_decode($json, true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Returns a filter value as string, or null when it is absent or not scalar.
     *
     * @param array<string, mixed> $filters
     */
    private static function stringFilter(array $filters, string $key): ?string
    {
        $value = $filters[$key] ?? null;

        return is_scalar($value) ? (string) $value : null;
    }

    /**
     * Case-insensitive substring test; multibyte-aware when ext-mbstring is loaded.
     */
    private static function containsIgnoreCase(string $haystack, string $needle): bool
    {
        if (function_exists('mb_stripos')) {
            return mb_stripos($haystack, $needle, 0, 'UTF-8') !== false;
        }

        return stripos($haystack, $needle) !== false;
    }

    /**
     * Tells whether any entity has a name containing $needle (case-insensitive).
     *
     * @param array<mixed> $entities
     */
    private static function hasEntityNamed(array $entities, string $needle): bool
    {
        foreach ($entities as $entity) {
            $name = is_array($entity) ? ($entity['name'] ?? null) : null;
            if (is_string($name) && self::containsIgnoreCase($name, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether any entity has a type equal to $type (case-insensitive).
     *
     * @param array<mixed> $entities
     */
    private static function hasEntityOfType(array $entities, string $type): bool
    {
        foreach ($entities as $entity) {
            $candidate = is_array($entity) ? ($entity['type'] ?? null) : null;
            if (is_string($candidate) && strcasecmp($candidate, $type) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether any keyword contains $needle (case-insensitive).
     *
     * @param array<mixed> $keywords
     */
    private static function hasKeywordLike(array $keywords, string $needle): bool
    {
        foreach ($keywords as $keyword) {
            if (is_string($keyword) && self::containsIgnoreCase($keyword, $needle)) {
                return true;
            }
        }

        return false;
    }
}

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
2016 33 modified 12.3 KB 2025-12-28 14:24
2015 32 modified 13.8 KB 2025-12-28 14:23
2014 31 modified 14.3 KB 2025-12-28 14:23
2013 30 modified 14.4 KB 2025-12-28 14:22
2006 29 modified 12.9 KB 2025-12-28 14:18
2003 28 modified 12.4 KB 2025-12-28 14:18
2002 27 modified 12.2 KB 2025-12-28 14:18
2001 26 modified 11.5 KB 2025-12-28 14:17
1974 25 modified 11.8 KB 2025-12-28 02:33
1973 24 modified 11.9 KB 2025-12-28 02:32
1972 23 modified 12.0 KB 2025-12-28 02:32
1971 22 modified 12.1 KB 2025-12-28 02:31
1970 21 modified 12.2 KB 2025-12-28 02:31
1969 20 modified 12.4 KB 2025-12-28 02:31
1968 19 modified 12.6 KB 2025-12-28 02:31
1967 18 modified 12.7 KB 2025-12-28 02:30
1966 17 modified 12.7 KB 2025-12-28 02:30
1965 16 modified 12.9 KB 2025-12-28 02:30
1964 15 modified 13.5 KB 2025-12-28 02:30
1510 14 modified 13.5 KB 2025-12-25 18:21
1509 13 modified 13.7 KB 2025-12-25 18:21
1502 12 modified 13.6 KB 2025-12-25 17:48
854 11 modified 13.7 KB 2025-12-23 08:46
853 10 modified 13.8 KB 2025-12-23 08:46
787 9 modified 14.0 KB 2025-12-23 08:05
366 8 modified 14.0 KB 2025-12-22 08:24
365 7 modified 13.9 KB 2025-12-22 08:24
332 6 modified 13.9 KB 2025-12-22 08:09
331 5 modified 13.9 KB 2025-12-22 08:09
330 4 modified 13.9 KB 2025-12-22 08:09
329 3 modified 13.9 KB 2025-12-22 08:09
38 2 modified 14.7 KB 2025-12-20 17:24
26 1 modified 14.7 KB 2025-12-20 17:17

← Zurück zur Übersicht