HybridSearchService.php

Code Hygiene Score: 68

Issues 1

Zeile Typ Beschreibung
- complexity Datei hat 353 Zeilen (max: 350)

Dependencies 10

Klassen 1

Funktionen 16

Verwendet von 4

Versionen 33

Code

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Hybrid-Suche kombiniert Qdrant-Vektoren mit SQL-Filtern

use Domain\Service\SearchServiceInterface;
use Infrastructure\AI\OllamaService;
use Infrastructure\AI\QdrantClient;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;

/**
 * Hybrid search: semantic vector hits from Qdrant are enriched with chunk and
 * document rows from SQL, post-filtered on chunk metadata and re-ranked with
 * lexical/structural boosts.
 */
final class HybridSearchService implements SearchServiceInterface
{
    use JsonDecodeTrait;

    /** Collection name handed to QdrantClient::search(). */
    private const string COLLECTION = 'dokumentation_chunks';

    /** Vector hits requested per wanted result, leaving headroom for the SQL-side filters. */
    private const int OVERSAMPLING_FACTOR = 3;

    /** Minimum vector score applied when the caller passes no numeric `min_score` filter. */
    private const float DEFAULT_MIN_SCORE = 0.3;

    public function __construct(
        private readonly PDO $pdo,
        private readonly OllamaService $ollama,
        private readonly QdrantClient $qdrant,
    ) {
    }

    /**
     * Hybrid search combining semantic vectors with SQL filters.
     *
     * Recognised filter keys: `taxonomy_category` (applied inside Qdrant),
     * `min_score`, `entity_name`, `entity_type`, `keyword`, `intent`,
     * `discourse_role` and `sentiment` (applied after SQL enrichment).
     *
     * @param array<string, mixed> $filters
     * @param int                  $limit   Maximum number of results; values below 1 yield an empty list.
     *
     * @return list<array<string, mixed>> Results ordered by descending `relevance_score`.
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // A non-positive limit can never produce results; skip the embedding and Qdrant
        // round-trips and avoid array_slice()'s "negative length" semantics.
        if ($limit < 1) {
            return [];
        }

        // Stage 1: semantic search in Qdrant (over-fetched, because stage 2 drops hits)
        $vectorResults = $this->semanticSearch($query, $filters, $limit * self::OVERSAMPLING_FACTOR);
        if ($vectorResults === []) {
            return [];
        }

        // Stage 2: enrich with SQL data and apply filters
        $enriched = $this->enrichAndFilter($vectorResults, $filters);

        // Stage 3: re-rank based on combined score
        $ranked = $this->rerank($enriched, $query);

        return array_slice($ranked, 0, $limit);
    }

    /** Searches within a specific taxonomy category. */
    public function searchByCategory(string $query, string $category, int $limit = 10): array
    {
        return $this->search($query, ['taxonomy_category' => $category], $limit);
    }

    /** Searches for chunks containing an entity whose name contains $entityName (case-insensitive). */
    public function searchByEntity(string $query, string $entityName, int $limit = 10): array
    {
        return $this->search($query, ['entity_name' => $entityName], $limit);
    }

    /** Searches for chunks with specific intent (explain, argue, define, etc.). */
    public function searchByIntent(string $query, string $intent, int $limit = 10): array
    {
        return $this->search($query, ['intent' => $intent], $limit);
    }

    /** Searches for definition chunks only. */
    public function searchDefinitions(string $query, int $limit = 10): array
    {
        return $this->search($query, ['discourse_role' => 'definition'], $limit);
    }

    /** Searches for evidence/example chunks for a topic. */
    public function searchEvidence(string $query, int $limit = 10): array
    {
        return $this->search($query, ['discourse_role' => 'evidence'], $limit);
    }

    /**
     * Gets all available taxonomy categories with counts.
     *
     * @return list<array{category: mixed, count: mixed}> Rows ordered by descending count.
     */
    public function getTaxonomyCategories(): array
    {
        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all entities grouped by type.
     *
     * Entries whose `name` or `type` is not a string are skipped; a non-scalar
     * type previously triggered an illegal-offset error.
     *
     * @return array<string, list<string>> Unique entity names per type, in first-seen order.
     */
    public function getEntitiesByType(): array
    {
        $stmt = $this->pdo->query("
            SELECT entities FROM dokumentation_chunks
            WHERE entities IS NOT NULL AND entities != '[]'
        ");

        // Names are used as keys for O(1) de-duplication; the value keeps the original
        // string because PHP turns numeric-string keys into integers.
        $byType = [];

        // Stream row by row instead of materialising the whole table with fetchAll()
        while (($row = $stmt->fetch(PDO::FETCH_ASSOC)) !== false) {
            foreach ($this->decodeJsonArray($row['entities'] ?? null) as $entity) {
                if (!is_array($entity)) {
                    continue;
                }
                $name = $entity['name'] ?? null;
                $type = $entity['type'] ?? null;
                if (is_string($name) && is_string($type)) {
                    $byType[$type][$name] ??= $name;
                }
            }
        }

        return array_map(array_values(...), $byType);
    }

    /**
     * Suggests related searches based on current results.
     *
     * Collects, in result order, each result's keywords followed by its entity
     * names and returns the first $limit distinct values.
     *
     * @param array<int, array<string, mixed>> $results Results as returned by search().
     * @param int                              $limit   Maximum number of suggestions (default 5).
     *
     * @return list<mixed>
     */
    public function suggestRelatedSearches(array $results, int $limit = 5): array
    {
        if ($limit < 1) {
            return [];
        }

        $suggestions = [];

        foreach ($results as $result) {
            $keywords = $result['keywords'] ?? [];
            $candidates = is_array($keywords) ? array_values($keywords) : [];

            $entities = $result['entities'] ?? [];
            foreach (is_array($entities) ? $entities : [] as $entity) {
                if (isset($entity['name'])) {
                    $candidates[] = $entity['name'];
                }
            }

            foreach ($candidates as $candidate) {
                if (in_array($candidate, $suggestions, true)) {
                    continue;
                }
                $suggestions[] = $candidate;
                // Stop as soon as enough suggestions exist; later results cannot change the output
                if (count($suggestions) === $limit) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /**
     * Performs semantic search in Qdrant.
     *
     * Only the `taxonomy_category` filter is translated into a Qdrant filter;
     * every other filter is applied later in matchesFilters().
     *
     * @return list<array{id: string, score: float, payload: array<mixed>}>
     */
    private function semanticSearch(string $query, array $filters, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);

        $qdrantFilter = null;
        if (isset($filters['taxonomy_category'])) {
            $qdrantFilter = [
                'must' => [
                    ['key' => 'taxonomy_category', 'match' => ['value' => $filters['taxonomy_category']]],
                ],
            ];
        }

        $hits = $this->qdrant->search(self::COLLECTION, $embedding, $limit, $qdrantFilter);

        // Normalise every hit to a fixed shape so later stages need no defensive checks
        return array_map(static fn (array $item): array => [
            'id' => (string) $item['id'],
            'score' => (float) ($item['score'] ?? 0),
            'payload' => is_array($item['payload'] ?? null) ? $item['payload'] : [],
        ], $hits);
    }

    /**
     * Enriches vector results with SQL data and applies filters.
     *
     * Hits below the minimum score, without a `chunk_id` payload, without a
     * matching SQL row, or rejected by matchesFilters() are dropped. The order
     * of the remaining hits is preserved.
     *
     * @return list<array<string, mixed>>
     */
    private function enrichAndFilter(array $vectorResults, array $filters): array
    {
        $minScore = is_numeric($filters['min_score'] ?? null)
            ? (float) $filters['min_score']
            : self::DEFAULT_MIN_SCORE;

        // Pass 1: cheap in-memory checks, collecting the chunk ids worth loading
        $candidates = [];
        foreach ($vectorResults as $vr) {
            $chunkId = (int) ($vr['payload']['chunk_id'] ?? 0);
            if ($vr['score'] < $minScore || $chunkId === 0) {
                continue;
            }
            $candidates[] = ['chunk_id' => $chunkId, 'score' => $vr['score']];
        }

        // Pass 2: one SQL round-trip for all candidates instead of one query per hit
        $chunks = $this->getChunksWithDocuments(array_column($candidates, 'chunk_id'));

        $results = [];
        foreach ($candidates as ['chunk_id' => $chunkId, 'score' => $score]) {
            $chunk = $chunks[$chunkId] ?? null;
            if ($chunk === null || !$this->matchesFilters($chunk, $filters)) {
                continue;
            }

            $results[] = [
                'chunk_id' => $chunkId,
                'doc_id' => (int) $chunk['dokumentation_id'],
                'path' => $chunk['doc_path'] ?? '',
                'title' => $chunk['doc_title'] ?? '',
                'content' => $chunk['content_clean'] ?? $chunk['content'] ?? '',
                'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
                'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
                'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),
                'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),
                // Semantic metadata
                'summary' => $chunk['summary'] ?? null,
                'sentiment' => $chunk['sentiment'] ?? 'neutral',
                'intent' => $chunk['intent'] ?? null,
                'discourse_role' => $chunk['discourse_role'] ?? null,
                // relevance_score starts equal to the vector score; rerank() overwrites it
                'score' => $score,
                'relevance_score' => $score,
            ];
        }

        return $results;
    }

    /**
     * Checks if chunk matches entity/keyword/semantic filters.
     *
     * `entity_name` and `keyword` are case-insensitive substring matches,
     * `entity_type` is a case-insensitive equality match, and `intent`,
     * `discourse_role` and `sentiment` are strict equality matches against the
     * chunk row. Text comparison is UTF-8 aware so umlauts fold correctly.
     */
    private function matchesFilters(array $chunk, array $filters): bool
    {
        if (isset($filters['entity_name']) || isset($filters['entity_type'])) {
            $entities = $this->decodeJsonArray($chunk['entities'] ?? null);

            if (
                isset($filters['entity_name'])
                && !self::anyContains(self::entityStrings($entities, 'name'), $filters['entity_name'])
            ) {
                return false;
            }

            if (isset($filters['entity_type'])) {
                $wanted = is_scalar($filters['entity_type'])
                    ? mb_strtoupper((string) $filters['entity_type'], 'UTF-8')
                    : null;
                $types = array_map(
                    static fn (string $type): string => mb_strtoupper($type, 'UTF-8'),
                    self::entityStrings($entities, 'type'),
                );
                if ($wanted === null || !in_array($wanted, $types, true)) {
                    return false;
                }
            }
        }

        if (
            isset($filters['keyword'])
            && !self::anyContains($this->decodeJsonArray($chunk['keywords'] ?? null), $filters['keyword'])
        ) {
            return false;
        }

        // Semantic filters: strict comparison against the raw column value.
        // NOTE(review): a NULL sentiment is reported as 'neutral' by enrichAndFilter() but
        // does not match a `sentiment => 'neutral'` filter here — confirm this is intended.
        foreach (['intent', 'discourse_role', 'sentiment'] as $field) {
            if (isset($filters[$field]) && ($chunk[$field] ?? null) !== $filters[$field]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Re-ranks results based on combined semantic and structural relevance.
     *
     * Adds a boost to each result's vector `score` (capped at 1.0) and sorts by
     * the resulting `relevance_score`, highest first.
     *
     * @param list<array<string, mixed>> $results Results as built by enrichAndFilter().
     *
     * @return list<array<string, mixed>>
     */
    private function rerank(array $results, string $query): array
    {
        $queryWords = self::tokenizeQuery($query);
        $isDefinitionQuery = $this->isDefinitionQuery($query);

        // Index-based write-back instead of a by-reference loop: no dangling reference
        // is left behind for the usort() below.
        foreach ($results as $index => $result) {
            $boost = 0.0;

            // Keyword matching boost
            foreach ($result['keywords'] ?? [] as $keyword) {
                foreach ($queryWords as $word) {
                    if (self::containsIgnoreCase($keyword, $word)) {
                        $boost += 0.05;
                    }
                }
            }

            // Entity matching boost
            foreach (self::entityStrings($result['entities'] ?? [], 'name') as $entityName) {
                foreach ($queryWords as $word) {
                    if (self::containsIgnoreCase($entityName, $word)) {
                        $boost += 0.03;
                    }
                }
            }

            // Title matching boost
            foreach ($queryWords as $word) {
                if (self::containsIgnoreCase($result['title'] ?? '', $word)) {
                    $boost += 0.1;
                }
            }

            // Semantic boost based on discourse role; definitions only win for "was ist"-style queries
            $discourseRole = $result['discourse_role'] ?? null;
            $boost += match (true) {
                $isDefinitionQuery && $discourseRole === 'definition' => 0.15,
                $discourseRole === 'thesis' => 0.08,
                $discourseRole === 'evidence' => 0.05,
                default => 0.0,
            };

            // Intent boost - explanations are generally more useful
            if (in_array($result['intent'] ?? null, ['explain', 'define'], true)) {
                $boost += 0.05;
            }

            $results[$index]['relevance_score'] = min(1.0, $result['score'] + $boost);
        }

        usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

        return $results;
    }

    /**
     * Detects if query is asking for a definition.
     *
     * Matches German openers ("was ist", "was sind", "was bedeutet",
     * "definition", "erklär…"/"erklar…", "beschreib…") at the start of the
     * query. The `u` modifier is required so that `[äa]` matches one character;
     * in byte mode the two-byte "ä" could never be followed by "r".
     */
    private function isDefinitionQuery(string $query): bool
    {
        $pattern = '/^(?:was\s+(?:ist|sind|bedeutet)\b|definition\b|erkl[äa]r|beschreib)/iu';

        // Leading whitespace must not defeat the ^ anchor; invalid UTF-8 yields false => no match
        return preg_match($pattern, ltrim($query)) === 1;
    }

    /**
     * Loads the given chunks together with their document title and path in one query.
     *
     * @param list<int> $chunkIds
     *
     * @return array<int, array<string, mixed>> Rows keyed by chunk id; unknown ids are absent.
     */
    private function getChunksWithDocuments(array $chunkIds): array
    {
        $ids = array_values(array_unique($chunkIds));
        if ($ids === []) {
            return [];
        }

        // Only "?" placeholders are interpolated; the ids themselves are bound parameters
        $placeholders = implode(', ', array_fill(0, count($ids), '?'));
        $stmt = $this->pdo->prepare("
            SELECT c.*, d.title as doc_title, d.path as doc_path
            FROM dokumentation_chunks c
            JOIN dokumentation d ON c.dokumentation_id = d.id
            WHERE c.id IN ({$placeholders})
        ");
        $stmt->execute($ids);

        $byId = [];
        while (($row = $stmt->fetch(PDO::FETCH_ASSOC)) !== false) {
            $byId[(int) $row['id']] = $row;
        }

        return $byId;
    }

    /**
     * Splits a query into unique, lower-cased, non-empty words (UTF-8 aware).
     *
     * @return list<string>
     */
    private static function tokenizeQuery(string $query): array
    {
        $words = preg_split('/\s+/u', mb_strtolower($query, 'UTF-8'), -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false for invalid UTF-8 input; treat that as "no words"
        return $words === false ? [] : array_values(array_unique($words));
    }

    /**
     * Extracts the string values stored under $field from a list of entity arrays.
     *
     * @param array<mixed> $entities
     *
     * @return list<string>
     */
    private static function entityStrings(array $entities, string $field): array
    {
        $values = [];
        foreach ($entities as $entity) {
            if (is_array($entity) && is_string($entity[$field] ?? null)) {
                $values[] = $entity[$field];
            }
        }

        return $values;
    }

    /** Tells whether any string in $haystacks contains $needle, ignoring case. */
    private static function anyContains(array $haystacks, mixed $needle): bool
    {
        foreach ($haystacks as $haystack) {
            if (self::containsIgnoreCase($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * UTF-8 aware, case-insensitive substring test.
     *
     * Non-string haystacks and non-scalar needles never match, so malformed
     * JSON metadata cannot raise a TypeError under strict_types.
     */
    private static function containsIgnoreCase(mixed $haystack, mixed $needle): bool
    {
        if (!is_string($haystack) || !is_scalar($needle)) {
            return false;
        }

        return mb_stripos($haystack, (string) $needle, 0, 'UTF-8') !== false;
    }
}
← Übersicht Graph