ChunkAnalyzer.php

Pfad: src/Infrastructure/Docs/ChunkAnalyzer.php
Namespace: Infrastructure\Docs
Zeilen: 224 | Größe: 6,604 Bytes
Geändert: 2025-12-25 16:57:23 | Gescannt: 2025-12-31 10:22:15

Code Hygiene Score: 98

Dependencies: 100 (25%)
LOC: 92 (20%)
Methods: 100 (20%)
Secrets: 100 (15%)
Classes: 100 (10%)
Magic Numbers: 100 (10%)

Keine Issues gefunden.

Dependencies 3

constructor Infrastructure\AI\OllamaService
use Infrastructure\AI\OllamaService
use RuntimeException

Klassen 1

ChunkAnalyzer class Zeile 12

Funktionen 9

__construct() public Zeile 17
analyze() public Zeile 29
buildAnalysisPrompt() private Zeile 57
callLlmWithRetry() private Zeile 84
parseAnalysisResponse() private Zeile 109
validateArray() private Zeile 144
validateEntities() private Zeile 162
deriveTaxonomyFromPath() private Zeile 188
decodeJsonArray() private Zeile 213

Verwendet von 2

ChunkAnalysisService.php constructor
InfrastructureServiceProvider.php use

Code

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Analysiert Chunks via LLM (Taxonomie, Entities, Keywords)

use Infrastructure\AI\OllamaService;
use RuntimeException;

/**
 * Analyzes documentation chunks with an LLM and turns the model's JSON answer
 * into a validated structure of taxonomy, entities and keywords.
 *
 * Everything that comes back from the model is treated as untrusted input:
 * it is decoded defensively, trimmed, and reduced to the documented shape.
 */
final class ChunkAnalyzer
{
    /** Model name passed to the Ollama service for every analysis request. */
    private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';

    /** Maximum number of generate() attempts before the call is given up. */
    private const int MAX_RETRIES = 3;

    /** Base pause between attempts in microseconds; multiplied by the attempt number. */
    private const int RETRY_BACKOFF_MICROSECONDS = 500_000;

    /** Number of levels kept when the taxonomy is derived from a document path. */
    private const int MAX_TAXONOMY_LEVELS = 3;

    /** Entity type used when the model supplies no usable type string. */
    private const string DEFAULT_ENTITY_TYPE = 'OTHER';

    /** Display labels for known path segments (keys are matched case-sensitively). */
    private const array PATH_LABELS = [
        'server' => 'Server',
        'modul' => 'Module',
        'anwendungen' => 'Anwendungen',
        'mcp' => 'MCP-Server',
        'ki-tasks' => 'KI-Tasks',
    ];

    public function __construct(
        private readonly OllamaService $ollama
    ) {
    }

    /**
     * Performs LLM analysis on chunk content.
     *
     * Reads `content_clean` (falling back to `content`) and `heading_path` from
     * the chunk, builds a prompt together with the document title and path, and
     * parses the model's answer. If the answer yields no taxonomy, one is derived
     * from the document path instead.
     *
     * @param array<string, mixed> $chunk
     * @param array{title: string, path: string} $docContext
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     *
     * @throws RuntimeException When every LLM attempt failed.
     */
    public function analyze(array $chunk, array $docContext): array
    {
        // A chunk without any content key is analyzed as empty text instead of
        // raising an undefined-index warning.
        $rawContent = $chunk['content_clean'] ?? $chunk['content'] ?? '';
        $content = is_scalar($rawContent) ? (string) $rawContent : '';
        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);

        $context = sprintf(
            "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
            $docContext['title'],
            $docContext['path'],
            implode(' > ', $headingPath),
            $content
        );

        $prompt = $this->buildAnalysisPrompt($context);
        $response = $this->callLlmWithRetry($prompt);
        $analysis = $this->parseAnalysisResponse($response);

        // Fallback: if the model produced no taxonomy, derive one from the document path.
        if ($analysis['taxonomy'] === []) {
            $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
        }

        return $analysis;
    }

    /**
     * Builds the analysis prompt by embedding the chunk context into the fixed
     * instruction text that asks the model for a JSON-only answer.
     */
    private function buildAnalysisPrompt(string $context): string
    {
        return <<<PROMPT
            Analysiere den folgenden technischen Dokumentationsabschnitt und extrahiere strukturierte Informationen.

            {$context}

            Antworte NUR mit einem JSON-Objekt in diesem exakten Format (keine Erklärungen):
            {
              "taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
              "entities": [
                {"name": "Entitätsname", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}
              ],
              "keywords": ["keyword1", "keyword2", "keyword3"]
            }

            Regeln:
            - taxonomy: Hierarchische Klassifikation (3 Ebenen: Bereich > Modul > Thema)
            - entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
            - keywords: 3-5 relevante Suchbegriffe
            - Antworte NUR mit dem JSON, keine anderen Texte
            PROMPT;
    }

    /**
     * Calls the LLM, retrying on RuntimeException with a linearly growing pause.
     *
     * @throws RuntimeException After MAX_RETRIES failed attempts; the last
     *                          failure is attached as the previous exception.
     */
    private function callLlmWithRetry(string $prompt): string
    {
        $lastError = null;

        for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
            try {
                return $this->ollama->generate($prompt, self::TAXONOMY_MODEL);
            } catch (RuntimeException $e) {
                $lastError = $e;
                // No pause after the final attempt: nothing is left to wait for.
                if ($attempt < self::MAX_RETRIES) {
                    usleep(self::RETRY_BACKOFF_MICROSECONDS * $attempt);
                }
            }
        }

        // Chain the last failure so its type and stack trace stay available to callers.
        throw new RuntimeException(
            'LLM call failed after ' . self::MAX_RETRIES . ' attempts: '
                . ($lastError?->getMessage() ?? 'No attempts made'),
            previous: $lastError
        );
    }

    /**
     * Parses the LLM response into structured data.
     *
     * JSON candidates are tried in this order: the content of the first
     * markdown code fence, the outermost `{...}` span, and finally the raw
     * response. The first candidate that decodes to an array is used; if none
     * does, all three result lists are empty.
     *
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     */
    private function parseAnalysisResponse(string $response): array
    {
        $candidates = [];
        if (preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $matches) === 1) {
            $candidates[] = $matches[1];
        }
        if (preg_match('/\{[\s\S]*\}/', $response, $matches) === 1) {
            $candidates[] = $matches[0];
        }
        $candidates[] = $response;

        $decoded = null;
        foreach ($candidates as $candidate) {
            $attempt = json_decode($candidate, true);
            if (is_array($attempt)) {
                $decoded = $attempt;
                break;
            }
        }

        if ($decoded === null) {
            return [
                'taxonomy' => [],
                'entities' => [],
                'keywords' => [],
            ];
        }

        return [
            'taxonomy' => $this->validateArray($decoded['taxonomy'] ?? []),
            'entities' => $this->validateEntities($decoded['entities'] ?? []),
            'keywords' => $this->validateArray($decoded['keywords'] ?? []),
        ];
    }

    /**
     * Reduces an arbitrary value to a list of trimmed, non-empty strings.
     *
     * Non-array input yields an empty list; non-string and blank items are dropped.
     *
     * @param mixed $arr
     * @return array<string>
     */
    private function validateArray(mixed $arr): array
    {
        if (!is_array($arr)) {
            return [];
        }

        $strings = [];
        foreach ($arr as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);
            if ($item !== '') {
                $strings[] = $item;
            }
        }

        return $strings;
    }

    /**
     * Reduces an arbitrary value to a list of well-formed entities.
     *
     * An entity is kept only if it is an array with a non-blank string `name`.
     * The `type` is trimmed and upper-cased; a missing, non-string or blank type
     * becomes DEFAULT_ENTITY_TYPE.
     *
     * @param mixed $entities
     * @return array<array{name: string, type: string}>
     */
    private function validateEntities(mixed $entities): array
    {
        if (!is_array($entities)) {
            return [];
        }

        $result = [];
        foreach ($entities as $entity) {
            if (!is_array($entity) || !is_string($entity['name'] ?? null)) {
                continue;
            }

            $name = trim($entity['name']);
            if ($name === '') {
                continue;
            }

            $rawType = $entity['type'] ?? null;
            $type = is_string($rawType) ? strtoupper(trim($rawType)) : '';

            $result[] = [
                'name' => $name,
                'type' => $type !== '' ? $type : self::DEFAULT_ENTITY_TYPE,
            ];
        }

        return $result;
    }

    /**
     * Derives taxonomy from document path.
     *
     * Splits the path on `/`, skips empty segments, maps known segments to their
     * label (otherwise upper-cases the first byte) and keeps the first
     * MAX_TAXONOMY_LEVELS entries.
     *
     * @return array<string>
     */
    private function deriveTaxonomyFromPath(string $path): array
    {
        // Explicit emptiness check: the default array_filter() callback would
        // also discard a legitimate "0" segment.
        $parts = array_filter(
            explode('/', trim($path, '/')),
            static fn (string $part): bool => $part !== ''
        );

        $taxonomy = [];
        foreach ($parts as $part) {
            $taxonomy[] = self::PATH_LABELS[$part] ?? ucfirst($part);
        }

        return array_slice($taxonomy, 0, self::MAX_TAXONOMY_LEVELS);
    }

    /**
     * Decodes a JSON array safely into a list of strings.
     *
     * Accepts a JSON string or an already decoded array; anything else, as well
     * as invalid JSON, yields an empty list. Only string and integer items are
     * kept (integers are cast), so the result is always safe to implode().
     *
     * @param mixed $json JSON text or a decoded array. NOTE(review): callers in
     *                    this class pass the chunk's `heading_path` value, whose
     *                    stored form is not visible here - confirm.
     * @return array<string>
     */
    private function decodeJsonArray(mixed $json): array
    {
        if (is_string($json) && $json !== '') {
            $json = json_decode($json, true);
        }

        if (!is_array($json)) {
            return [];
        }

        $strings = [];
        foreach ($json as $item) {
            if (is_string($item) || is_int($item)) {
                $strings[] = (string) $item;
            }
        }

        return $strings;
    }
}

← Übersicht Graph