ChunkAnalyzer.php

Code Hygiene Score: 98

Keine Issues gefunden.

Dependencies 3

Klassen 1

Funktionen 9

Verwendet von 2

Code

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Analysiert Chunks via LLM (Taxonomie, Entities, Keywords)

use Infrastructure\AI\OllamaService;
use RuntimeException;

final class ChunkAnalyzer
{
    /** Ollama model used for the chunk analysis prompt. */
    private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';

    /** Maximum number of LLM call attempts before giving up. */
    private const int MAX_RETRIES = 3;

    /** Maximum number of levels in a taxonomy derived from the document path. */
    private const int TAXONOMY_DEPTH = 3;

    /** Display labels for well-known path segments (used by the path fallback). */
    private const array PATH_LABELS = [
        'server' => 'Server',
        'modul' => 'Module',
        'anwendungen' => 'Anwendungen',
        'mcp' => 'MCP-Server',
        'ki-tasks' => 'KI-Tasks',
    ];

    public function __construct(
        private readonly OllamaService $ollama,
    ) {
    }

    /**
     * Performs LLM analysis on chunk content.
     *
     * Builds a context string from the document metadata, the chunk's heading
     * path and its content, sends it to the LLM and parses the JSON answer.
     * When the answer contains no usable taxonomy, one is derived from the
     * document path instead.
     *
     * @param array<string, mixed> $chunk Reads `content_clean` (preferred), `content` and `heading_path`.
     * @param array{title: string, path: string} $docContext
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     *
     * @throws RuntimeException When every LLM attempt failed.
     */
    public function analyze(array $chunk, array $docContext): array
    {
        // Tolerate chunks that carry neither content key instead of raising an undefined-key warning.
        $rawContent = $chunk['content_clean'] ?? $chunk['content'] ?? '';
        $content = is_scalar($rawContent) ? (string) $rawContent : '';
        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);

        $context = sprintf(
            "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
            $docContext['title'],
            $docContext['path'],
            implode(' > ', $headingPath),
            $content,
        );

        $prompt = $this->buildAnalysisPrompt($context);
        $response = $this->callLlmWithRetry($prompt);
        $analysis = $this->parseAnalysisResponse($response);

        // Fallback: without an LLM taxonomy, derive one from the document path.
        if ($analysis['taxonomy'] === []) {
            $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
        }

        return $analysis;
    }

    /**
     * Builds the analysis prompt by embedding the context into the fixed instruction template.
     */
    private function buildAnalysisPrompt(string $context): string
    {
        return <<<PROMPT
            Analysiere den folgenden technischen Dokumentationsabschnitt und extrahiere strukturierte Informationen.

            {$context}

            Antworte NUR mit einem JSON-Objekt in diesem exakten Format (keine Erklärungen):
            {
              "taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
              "entities": [
                {"name": "Entitätsname", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}
              ],
              "keywords": ["keyword1", "keyword2", "keyword3"]
            }

            Regeln:
            - taxonomy: Hierarchische Klassifikation (3 Ebenen: Bereich > Modul > Thema)
            - entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
            - keywords: 3-5 relevante Suchbegriffe
            - Antworte NUR mit dem JSON, keine anderen Texte
            PROMPT;
    }

    /**
     * Calls the LLM, retrying on RuntimeException with a growing pause between attempts.
     *
     * @throws RuntimeException After MAX_RETRIES failed attempts; the last failure is attached as `previous`.
     */
    private function callLlmWithRetry(string $prompt): string
    {
        $lastError = null;

        for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
            try {
                return $this->ollama->generate($prompt, self::TAXONOMY_MODEL);
            } catch (RuntimeException $e) {
                $lastError = $e;
                if ($attempt < self::MAX_RETRIES) {
                    usleep(500000 * $attempt); // Progressive backoff: 0.5s, 1.0s, ...
                }
            }
        }

        // Chain the original exception so its stack trace and type are not lost.
        throw new RuntimeException(
            'LLM call failed after ' . self::MAX_RETRIES . ' attempts: '
                . ($lastError?->getMessage() ?? 'No attempts made'),
            previous: $lastError,
        );
    }

    /**
     * Parses the LLM response into structured data.
     *
     * Tries several JSON candidates in order and uses the first one that
     * decodes to an array. Returns empty lists when none does.
     *
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     */
    private function parseAnalysisResponse(string $response): array
    {
        foreach ($this->extractJsonCandidates($response) as $candidate) {
            $decoded = json_decode($candidate, true);

            if (is_array($decoded)) {
                return [
                    'taxonomy' => $this->validateArray($decoded['taxonomy'] ?? []),
                    'entities' => $this->validateEntities($decoded['entities'] ?? []),
                    'keywords' => $this->validateArray($decoded['keywords'] ?? []),
                ];
            }
        }

        return [
            'taxonomy' => [],
            'entities' => [],
            'keywords' => [],
        ];
    }

    /**
     * Collects the substrings of the response that may hold the JSON payload.
     *
     * Order: content of the first markdown code fence, the span from the first
     * `{` to the last `}`, and finally the raw response.
     *
     * @return list<string>
     */
    private function extractJsonCandidates(string $response): array
    {
        $candidates = [];

        // preg_match returns false on PCRE errors; only a real match (1) is usable.
        if (preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $matches) === 1) {
            $candidates[] = $matches[1];
        }

        if (preg_match('/\{[\s\S]*\}/', $response, $matches) === 1) {
            $candidates[] = $matches[0];
        }

        $candidates[] = $response;

        return $candidates;
    }

    /**
     * Validates an array of strings: keeps only non-blank strings and trims them.
     *
     * @param mixed $arr
     * @return array<string>
     */
    private function validateArray(mixed $arr): array
    {
        if (!is_array($arr)) {
            return [];
        }

        $result = [];
        foreach ($arr as $item) {
            if (!is_string($item)) {
                continue;
            }

            $text = trim($item);
            if ($text !== '') {
                $result[] = $text;
            }
        }

        return $result;
    }

    /**
     * Validates entities array.
     *
     * Keeps entries with a non-blank string `name`. The `type` is trimmed and
     * upper-cased; a missing, non-string or blank type becomes 'OTHER'.
     *
     * @param mixed $entities
     * @return array<array{name: string, type: string}>
     */
    private function validateEntities(mixed $entities): array
    {
        if (!is_array($entities)) {
            return [];
        }

        $result = [];
        foreach ($entities as $entity) {
            if (!is_array($entity) || !is_string($entity['name'] ?? null)) {
                continue;
            }

            $name = trim($entity['name']);
            if ($name === '') {
                continue;
            }

            $rawType = $entity['type'] ?? null;
            $type = is_string($rawType) ? strtoupper(trim($rawType)) : '';

            $result[] = [
                'name' => $name,
                'type' => $type !== '' ? $type : 'OTHER',
            ];
        }

        return $result;
    }

    /**
     * Derives taxonomy from document path.
     *
     * Each non-empty path segment becomes one level: known segments are mapped
     * via PATH_LABELS, all others are capitalised. At most TAXONOMY_DEPTH
     * levels are returned.
     *
     * @return array<string>
     */
    private function deriveTaxonomyFromPath(string $path): array
    {
        // Explicit callback: a bare array_filter() would also drop the segment "0".
        $segments = array_filter(
            array_map(trim(...), explode('/', $path)),
            static fn (string $segment): bool => $segment !== '',
        );

        $taxonomy = [];
        foreach ($segments as $segment) {
            $taxonomy[] = self::PATH_LABELS[$segment] ?? $this->capitalize($segment);
        }

        return array_slice($taxonomy, 0, self::TAXONOMY_DEPTH);
    }

    /**
     * Upper-cases the first character; multibyte-aware when mbstring is available.
     */
    private function capitalize(string $text): string
    {
        // ucfirst() is byte-wise and leaves e.g. a leading umlaut untouched.
        if (!function_exists('mb_substr') || !mb_check_encoding($text, 'UTF-8')) {
            return ucfirst($text);
        }

        return mb_strtoupper(mb_substr($text, 0, 1, 'UTF-8'), 'UTF-8')
            . mb_substr($text, 1, null, 'UTF-8');
    }

    /**
     * Decodes a heading path into a list of strings.
     *
     * Accepts a JSON-encoded array or an already decoded array. Any other
     * input, or invalid JSON, yields an empty list. Only strings and numbers
     * are kept (trimmed, blanks dropped) so the result is safe for implode().
     *
     * @return array<string>
     */
    private function decodeJsonArray(mixed $json): array
    {
        if (is_string($json) && $json !== '') {
            $json = json_decode($json, true);
        }

        if (!is_array($json)) {
            return [];
        }

        $result = [];
        foreach ($json as $item) {
            if (!is_string($item) && !is_int($item) && !is_float($item)) {
                continue;
            }

            $text = trim((string) $item);
            if ($text !== '') {
                $result[] = $text;
            }
        }

        return $result;
    }
}
← Übersicht Graph