ContentSearchService.php

Code Hygiene Score: 68

Issues 1

Zeile Typ Beschreibung
- complexity Datei hat 365 Zeilen (max: 350)

Dependencies 10

Klassen 1

Funktionen 15

Verwendet von 1

Code

<?php

declare(strict_types=1);

namespace Infrastructure\Search;

// @responsibility: Semantische Suche über ki_content mit Chunk-Semantik

use Domain\Service\SearchServiceInterface;
use Infrastructure\AI\OllamaService;
use Infrastructure\Config\CredentialService;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
use RuntimeException;

/**
 * Hybrid content search: vector similarity from Qdrant, enriched and filtered
 * with chunk semantics stored in the `ki_content` schema, then re-ranked with
 * lightweight lexical and discourse-role boosts.
 *
 * Failure policy: a failing Qdrant request degrades to an empty result list
 * (see semanticSearch()); database errors are not caught here.
 */
final class ContentSearchService implements SearchServiceInterface
{
    use JsonDecodeTrait;

    private const string COLLECTION = 'documents';
    private const int TIMEOUT = 30;
    private const float MIN_SCORE = 0.3;

    /** Upper bound on the number of keyword suggestions returned. */
    private const int MAX_SUGGESTIONS = 5;

    /** Filter keys compared for strict equality against the chunk column of the same name. */
    private const array EXACT_MATCH_FILTERS = ['intent', 'discourse_role', 'sentiment', 'frame'];

    /**
     * German "definition-style" query openers. The `u` modifier is required:
     * without it the multi-byte `ä` inside the character class is read as two
     * separate bytes and "erklär…" can never match.
     */
    private const string DEFINITION_QUERY_PATTERN = '/^(?:was\s+(?:ist|sind|bedeutet)\b|definition\b|erkl[äa]r)/iu';

    private readonly string $qdrantHost;

    public function __construct(
        private readonly PDO $pdo,
        private readonly OllamaService $ollama,
    ) {
        // Strip a trailing slash so the URL built in semanticSearch() never contains "//collections".
        $this->qdrantHost = rtrim(CredentialService::getQdrantHost(), '/');
    }

    /**
     * Hybrid search combining semantic vectors with SQL filters.
     *
     * @param string               $query   Free-text query; a blank query yields no results.
     * @param array<string, mixed> $filters Optional keys: `min_score`, `intent`, `discourse_role`,
     *                                      `sentiment`, `frame`, `taxonomy_path`.
     * @param int                  $limit   Maximum number of results; values below 1 yield no results.
     *
     * @return list<array<string, mixed>> Enriched chunks ordered by descending `relevance_score`.
     */
    public function search(string $query, array $filters = [], int $limit = 10): array
    {
        // A negative limit would otherwise reach array_slice() as a negative length
        // and silently drop elements from the tail instead of limiting the result.
        if ($limit < 1 || trim($query) === '') {
            return [];
        }

        // Stage 1: over-fetch from Qdrant, because stage 2 discards hits that
        // fall below the score threshold or fail the SQL-side filters.
        $vectorResults = $this->semanticSearch($query, $limit * 3);
        if ($vectorResults === []) {
            return [];
        }

        // Stage 2: enrich with SQL data (chunks + semantics) and apply filters.
        $enrichedResults = $this->enrichWithSemantics($vectorResults, $filters);

        // Stage 3: re-rank, then cut down to the requested size.
        return array_slice($this->rerank($enrichedResults, $query), 0, $limit);
    }

    /** Searches for definition chunks only (filter `discourse_role = definition`). */
    public function searchDefinitions(string $query, int $limit = 10): array
    {
        return $this->search($query, ['discourse_role' => 'definition'], $limit);
    }

    /** Searches for chunks whose `intent` equals the given value. */
    public function searchByIntent(string $query, string $intent, int $limit = 10): array
    {
        return $this->search($query, ['intent' => $intent], $limit);
    }

    /** Searches for chunks tagged with a taxonomy term whose path starts with `$taxonomyPath`. */
    public function searchByTaxonomy(string $query, string $taxonomyPath, int $limit = 10): array
    {
        return $this->search($query, ['taxonomy_path' => $taxonomyPath], $limit);
    }

    /**
     * Suggests related searches based on current results.
     *
     * Collects the first distinct keywords (in result order) found under each
     * result's `keywords` entry, up to MAX_SUGGESTIONS.
     *
     * @param array<int, array<string, mixed>> $results Results as returned by search().
     *
     * @return list<mixed> Distinct keywords, at most MAX_SUGGESTIONS entries.
     */
    public function suggestRelatedSearches(array $results): array
    {
        $suggestions = [];

        foreach ($results as $result) {
            $keywords = $result['keywords'] ?? [];
            if (!is_array($keywords)) {
                continue;
            }

            foreach ($keywords as $keyword) {
                if (in_array($keyword, $suggestions, true)) {
                    continue;
                }

                $suggestions[] = $keyword;

                // Stop as soon as the cap is reached instead of scanning the remaining results.
                if (count($suggestions) >= self::MAX_SUGGESTIONS) {
                    return $suggestions;
                }
            }
        }

        return $suggestions;
    }

    /**
     * Gets semantic statistics for the content.
     *
     * @return array<int, array<string, mixed>> One row per (intent, discourse_role) pair with its
     *                                          `count`, most frequent first; rows without intent are excluded.
     */
    public function getSemanticStats(): array
    {
        $stmt = $this->pdo->query(<<<'SQL'
            SELECT
                cs.intent,
                cs.discourse_role,
                COUNT(*) as count
            FROM ki_content.chunk_semantics cs
            WHERE cs.intent IS NOT NULL
            GROUP BY cs.intent, cs.discourse_role
            ORDER BY count DESC
            SQL);

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Performs semantic search in Qdrant.
     *
     * Best-effort by design: a failed request (RuntimeException from makeRequest())
     * or an unexpected response shape yields an empty list rather than an error.
     *
     * @return list<array{qdrant_id: string, score: float, payload: array}>
     */
    private function semanticSearch(string $query, int $limit): array
    {
        $embedding = $this->ollama->getEmbedding($query);

        // Without a vector there is nothing to search for; skip the HTTP round trip.
        if ($embedding === []) {
            return [];
        }

        $url = sprintf('%s/collections/%s/points/search', $this->qdrantHost, self::COLLECTION);

        $payload = [
            // array_values() guarantees a JSON list even if the embedding arrives with non-sequential keys.
            'vector' => array_values($embedding),
            'limit' => $limit,
            'with_payload' => true,
        ];

        try {
            $response = $this->makeRequest($url, $payload);
        } catch (RuntimeException) {
            return [];
        }

        $points = $response['result'] ?? null;
        if (!is_array($points)) {
            return [];
        }

        $hits = [];
        foreach ($points as $point) {
            // Skip malformed entries instead of failing the whole search on one bad hit.
            if (!is_array($point)) {
                continue;
            }

            $id = $point['id'] ?? null;
            if (!is_int($id) && !is_string($id)) {
                continue;
            }

            $hits[] = [
                'qdrant_id' => (string) $id,
                'score' => (float) ($point['score'] ?? 0),
                'payload' => is_array($point['payload'] ?? null) ? $point['payload'] : [],
            ];
        }

        return $hits;
    }

    /**
     * Enriches vector results with chunk semantics from database.
     *
     * Hits scoring below the threshold (`$filters['min_score']`, default MIN_SCORE),
     * hits without a matching chunk row, and hits failing matchesFilters() are dropped.
     *
     * @param list<array{qdrant_id: string, score: float, payload: array}> $vectorResults
     * @param array<string, mixed>                                         $filters
     *
     * @return list<array<string, mixed>>
     */
    private function enrichWithSemantics(array $vectorResults, array $filters): array
    {
        // Ignore a non-numeric threshold rather than comparing a float against arbitrary input.
        $minScore = is_numeric($filters['min_score'] ?? null)
            ? (float) $filters['min_score']
            : self::MIN_SCORE;

        $results = [];

        foreach ($vectorResults as $vr) {
            if ($vr['score'] < $minScore) {
                continue;
            }

            $chunkData = $this->getChunkWithSemantics($vr['qdrant_id']);
            if ($chunkData === null || !$this->matchesFilters($chunkData, $filters)) {
                continue;
            }

            $results[] = [
                'chunk_id' => (int) $chunkData['chunk_id'],
                'document_id' => (int) $chunkData['document_id'],
                'source_path' => $chunkData['source_path'] ?? '',
                'content' => $chunkData['content'] ?? '',
                'heading_path' => $chunkData['heading_path'] ?? '',
                // Semantic data
                'summary' => $chunkData['summary'] ?? null,
                'keywords' => $this->decodeJsonArray($chunkData['keywords'] ?? null),
                'sentiment' => $chunkData['sentiment'] ?? 'neutral',
                'intent' => $chunkData['intent'] ?? null,
                'discourse_role' => $chunkData['discourse_role'] ?? null,
                'statement_form' => $chunkData['statement_form'] ?? null,
                'frame' => $chunkData['frame'] ?? null,
                // Scores: relevance_score starts as the raw vector score and is adjusted by rerank().
                'score' => $vr['score'],
                'relevance_score' => $vr['score'],
            ];
        }

        return $results;
    }

    /**
     * Gets chunk with semantic data from ki_content.
     *
     * Semantics are LEFT JOINed, so their columns are null for chunks that have
     * no chunk_semantics row.
     *
     * @return array<string, mixed>|null The chunk row, or null when no chunk has this Qdrant id.
     */
    private function getChunkWithSemantics(string $qdrantId): ?array
    {
        $stmt = $this->pdo->prepare(<<<'SQL'
            SELECT
                c.id as chunk_id,
                c.document_id,
                c.content,
                c.heading_path,
                d.source_path,
                cs.summary,
                cs.keywords,
                cs.sentiment,
                cs.intent,
                cs.discourse_role,
                cs.statement_form,
                cs.frame
            FROM ki_content.chunks c
            JOIN ki_content.documents d ON c.document_id = d.id
            LEFT JOIN ki_content.chunk_semantics cs ON c.id = cs.chunk_id
            WHERE c.qdrant_id = :qdrant_id
            LIMIT 1
            SQL);
        $stmt->execute(['qdrant_id' => $qdrantId]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Checks if chunk matches filters.
     *
     * Equality filters compare against the raw database value, so a filter on a
     * column that is null for this chunk never matches.
     *
     * @param array<string, mixed> $chunk   Row as returned by getChunkWithSemantics().
     * @param array<string, mixed> $filters
     */
    private function matchesFilters(array $chunk, array $filters): bool
    {
        foreach (self::EXACT_MATCH_FILTERS as $field) {
            if (isset($filters[$field]) && ($chunk[$field] ?? null) !== $filters[$field]) {
                return false;
            }
        }

        // Taxonomy is checked last because it is the only filter that costs a query.
        if (isset($filters['taxonomy_path'])) {
            return $this->chunkMatchesTaxonomy((int) $chunk['chunk_id'], $filters['taxonomy_path']);
        }

        return true;
    }

    /** Checks if chunk is tagged with a taxonomy term whose path starts with `$taxonomyPath`. */
    private function chunkMatchesTaxonomy(int $chunkId, string $taxonomyPath): bool
    {
        $stmt = $this->pdo->prepare(<<<'SQL'
            SELECT 1
            FROM ki_content.chunk_taxonomy ct
            JOIN ki_content.taxonomy_terms tt ON ct.taxonomy_term_id = tt.id
            WHERE ct.chunk_id = :chunk_id AND tt.path LIKE :path ESCAPE '!'
            LIMIT 1
            SQL);

        // Neutralise LIKE metacharacters in the caller's path so that "_" and "%"
        // match literally; only the trailing "%" appended here acts as a wildcard.
        $escapedPrefix = strtr($taxonomyPath, ['!' => '!!', '%' => '!%', '_' => '!_']);

        $stmt->execute([
            'chunk_id' => $chunkId,
            'path' => $escapedPrefix . '%',
        ]);

        return $stmt->fetch() !== false;
    }

    /**
     * Re-ranks results based on semantic relevance.
     *
     * Sets each result's `relevance_score` to its vector `score` plus the boost
     * from relevanceBoost(), capped at 1.0, and sorts by that value descending.
     *
     * @param list<array<string, mixed>> $results Results as built by enrichWithSemantics().
     *
     * @return list<array<string, mixed>>
     */
    private function rerank(array $results, string $query): array
    {
        $queryWords = $this->tokenize($query);
        $isDefinitionQuery = $this->isDefinitionQuery($query);

        // Write back by key instead of iterating by reference: no dangling reference to clean up.
        foreach ($results as $index => $result) {
            $boost = $this->relevanceBoost($result, $queryWords, $isDefinitionQuery);
            $results[$index]['relevance_score'] = min(1.0, $result['score'] + $boost);
        }

        usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

        return $results;
    }

    /**
     * Splits a query into lower-cased, whitespace-separated words.
     *
     * Multibyte-aware so that umlauts are case-folded too.
     *
     * @return list<string>
     */
    private function tokenize(string $query): array
    {
        $words = preg_split('/\s+/u', mb_strtolower($query), -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the input is not valid UTF-8; rank without lexical boosts then.
        return $words === false ? [] : $words;
    }

    /**
     * Computes the additive score boost for a single result.
     *
     * - +0.05 per (keyword, query word) pair where the keyword contains the word
     * - +0.15 for a definition chunk on a definition query, else +0.08 for a
     *   thesis chunk, else +0.05 for an evidence chunk
     * - +0.05 when the intent is `explain` or `define`
     * - +0.03 per query word contained in the summary
     *
     * @param array<string, mixed> $result
     * @param list<string>         $queryWords Lower-cased words from tokenize().
     */
    private function relevanceBoost(array $result, array $queryWords, bool $isDefinitionQuery): float
    {
        $boost = 0.0;

        $keywords = $result['keywords'] ?? [];
        foreach (is_array($keywords) ? $keywords : [] as $keyword) {
            // Decoded JSON may contain non-string entries; they cannot match a word.
            if (!is_string($keyword)) {
                continue;
            }

            $keyword = mb_strtolower($keyword);
            foreach ($queryWords as $word) {
                if (str_contains($keyword, $word)) {
                    $boost += 0.05;
                }
            }
        }

        $discourseRole = $result['discourse_role'] ?? null;
        $boost += match (true) {
            $isDefinitionQuery && $discourseRole === 'definition' => 0.15,
            $discourseRole === 'thesis' => 0.08,
            $discourseRole === 'evidence' => 0.05,
            default => 0.0,
        };

        $intent = $result['intent'] ?? null;
        if ($intent === 'explain' || $intent === 'define') {
            $boost += 0.05;
        }

        $summary = $result['summary'] ?? null;
        if (is_string($summary)) {
            $summary = mb_strtolower($summary);
            foreach ($queryWords as $word) {
                if (str_contains($summary, $word)) {
                    $boost += 0.03;
                }
            }
        }

        return $boost;
    }

    /** Detects if query is asking for a definition (German openers, leading whitespace ignored). */
    private function isDefinitionQuery(string $query): bool
    {
        return preg_match(self::DEFINITION_QUERY_PATTERN, trim($query)) === 1;
    }

    /**
     * Makes HTTP request to Qdrant.
     *
     * Sends `$payload` as a JSON POST body and returns the decoded response.
     *
     * @param array<string, mixed> $payload
     *
     * @return array<mixed> Decoded JSON response; an empty array when the body is not a JSON array/object.
     *
     * @throws RuntimeException When the payload cannot be encoded, cURL cannot be initialised,
     *                          the transfer fails, or the server answers with HTTP status >= 400.
     */
    private function makeRequest(string $url, array $payload): array
    {
        // Encode first so that no cURL handle exists yet if encoding fails.
        $jsonPayload = json_encode($payload);
        if ($jsonPayload === false) {
            throw new RuntimeException('Failed to encode JSON');
        }

        $ch = curl_init($url);
        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        // Content-Length is derived by cURL from the POST body; no manual header needed.
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $jsonPayload,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/json',
            ],
        ]);

        $result = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $transportError = curl_error($ch);
        // No curl_close(): the handle is an object that is released when $ch goes out of scope.

        if ($result === false) {
            // Transport-level failure (DNS, connect, timeout): the HTTP code is 0,
            // so include the cURL error text to make the cause diagnosable.
            throw new RuntimeException(sprintf(
                'Qdrant request failed: HTTP %d (%s)',
                $httpCode,
                $transportError !== '' ? $transportError : 'unknown cURL error',
            ));
        }

        if ($httpCode >= 400) {
            throw new RuntimeException(sprintf('Qdrant request failed: HTTP %d', $httpCode));
        }

        $decoded = json_decode((string) $result, true);

        return is_array($decoded) ? $decoded : [];
    }
}
← Übersicht Graph