RagContextBuilder.php

Code Hygiene Score: 97

Keine Issues gefunden.

Dependencies 2

Klassen 1

Funktionen 5

Verwendet von 2

Code

<?php

declare(strict_types=1);

namespace UseCases\Chat;

// @responsibility: Baut RAG-Kontext aus semantischer Suche

use Domain\Service\SearchServiceInterface;

/**
 * Turns semantic search hits into the pieces a RAG chat turn needs:
 * normalized result rows, a size-limited context string and a deduplicated
 * source list.
 */
final class RagContextBuilder
{
    /** Budget for the assembled context, expressed in estimated tokens. */
    private const MAX_CONTEXT_TOKENS = 3000;

    /** Rough characters-per-token ratio used to convert the token budget into a character budget. */
    private const CHARS_PER_TOKEN = 4;

    public function __construct(
        private SearchServiceInterface $searchService
    ) {
    }

    /**
     * Run a search and normalize each hit into the row shape used by
     * buildContext() and extractSources().
     *
     * The search service is only queried when 'documents' is among the
     * requested collections; otherwise an empty list is returned.
     *
     * @param array<string> $collections Requested collection names
     * @return array<array<string, mixed>> One row per hit, tagged with '_collection' => 'documents'
     *
     * @throws \RuntimeException When search fails (raised by the search service, not by this class)
     */
    public function search(string $query, array $collections, int $limit): array
    {
        // An empty collection list fails this check as well, so no separate guard is needed.
        if (!in_array('documents', $collections, true)) {
            return [];
        }

        // NOTE(review): the second argument is always an empty array here; presumably
        // these are search filters - confirm against SearchServiceInterface.
        $hits = $this->searchService->search($query, [], $limit);

        $rows = [];
        foreach ($hits as $hit) {
            $rows[] = [
                'chunk_id' => $hit['chunk_id'],
                'content' => $hit['content'],
                // Prefer the source path as display title, then the heading path.
                'title' => $hit['source_path'] ?? $hit['heading_path'] ?? 'Unbekannt',
                'score' => $hit['relevance_score'],
                'summary' => $hit['summary'] ?? null,
                'keywords' => $hit['keywords'] ?? [],
                'intent' => $hit['intent'] ?? null,
                'discourse_role' => $hit['discourse_role'] ?? null,
                'sentiment' => $hit['sentiment'] ?? null,
                'frame' => $hit['frame'] ?? null,
                '_collection' => 'documents',
            ];
        }

        return $rows;
    }

    /**
     * Build the context string from search results.
     *
     * Results are appended in the given order, each prefixed with a
     * "[Quelle N: title]" header, until adding the next result's content
     * would exceed the character budget (MAX_CONTEXT_TOKENS * CHARS_PER_TOKEN).
     * Processing stops at the first result that does not fit; later results
     * are not considered. Only the content counts toward the budget, not the
     * headers or separators.
     *
     * Content length is measured in characters (not bytes) so that multibyte
     * text is not over-counted, and sources are numbered by their position in
     * the input regardless of the array keys.
     *
     * @param array<array<string, mixed>> $searchResults
     * @return string Context blocks joined by a "---" separator, or '' when nothing fits
     */
    public function buildContext(array $searchResults): string
    {
        if ($searchResults === []) {
            return '';
        }

        $maxChars = self::MAX_CONTEXT_TOKENS * self::CHARS_PER_TOKEN;
        $contextParts = [];
        $totalChars = 0;
        $position = 0;

        foreach ($searchResults as $result) {
            $content = (string) ($result['content'] ?? '');
            $title = (string) ($result['title'] ?? 'Unbekannt');

            // Character count, consistent with the mb_substr() used for source excerpts.
            $contentLength = mb_strlen($content);
            if ($totalChars + $contentLength > $maxChars) {
                break;
            }

            // Number by position so string or sparse keys cannot break or skew the numbering.
            $position++;
            $contextParts[] = $this->buildSemanticHeader($position, $title, $result) . "\n" . $content;
            $totalChars += $contentLength;
        }

        return implode("\n\n---\n\n", $contextParts);
    }

    /**
     * Extract deduplicated sources from results.
     *
     * Results are deduplicated by title (first occurrence wins); results with
     * an empty or missing title are skipped. Each source carries the title and
     * the score rounded to three decimals, plus - when present - a content
     * excerpt of at most 300 characters, the collection, discourse role,
     * intent and summary.
     *
     * @param array<array<string, mixed>> $searchResults
     * @return array<array<string, mixed>>
     */
    public function extractSources(array $searchResults): array
    {
        $sources = [];
        $seenTitles = [];

        foreach ($searchResults as $result) {
            $title = (string) ($result['title'] ?? '');
            if ($title === '' || isset($seenTitles[$title])) {
                continue;
            }
            $seenTitles[$title] = true;

            $source = [
                'title' => $title,
                'score' => round((float) ($result['score'] ?? 0), 3),
            ];

            $content = $result['content'] ?? null;
            if (is_string($content) && $content !== '') {
                $source['content'] = mb_substr($content, 0, 300);
            }

            // The internal '_collection' tag is exposed under the public key 'collection'.
            if (isset($result['_collection'])) {
                $source['collection'] = $result['_collection'];
            }

            // Optional semantic metadata is copied through only when set and non-null.
            foreach (['discourse_role', 'intent', 'summary'] as $key) {
                if (isset($result[$key])) {
                    $source[$key] = $result[$key];
                }
            }

            $sources[] = $source;
        }

        return $sources;
    }

    /**
     * Build the header line for one context chunk.
     *
     * Produces "[Quelle N: title]", followed by a parenthesized, comma-separated
     * list of the translated discourse role and intent when those are set.
     * Unknown role/intent values are passed through unchanged.
     *
     * @param int $index 1-based source number shown in the header
     * @param array<string, mixed> $result
     */
    private function buildSemanticHeader(int $index, string $title, array $result): string
    {
        $labels = [];

        if (isset($result['discourse_role'])) {
            $labels[] = match ($result['discourse_role']) {
                'definition' => 'Definition',
                'thesis' => 'These',
                'evidence' => 'Beleg',
                'example' => 'Beispiel',
                'summary' => 'Zusammenfassung',
                default => $result['discourse_role'],
            };
        }

        if (isset($result['intent'])) {
            $labels[] = match ($result['intent']) {
                'explain' => 'erklärt',
                'define' => 'definiert',
                'argue' => 'argumentiert',
                'compare' => 'vergleicht',
                'exemplify' => 'veranschaulicht',
                default => $result['intent'],
            };
        }

        $header = sprintf('[Quelle %d: %s]', $index, $title);
        if ($labels === []) {
            return $header;
        }

        return $header . ' (' . implode(', ', $labels) . ')';
    }
}
← Übersicht Graph