Backup #1399

ID: 1399
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkAnalysisService.php
Version: 12
Typ: modified
Größe: 12.4 KB
Hash: 2d33c68f918813b671cead0c8c84530dada01669ce55d9365f3e0aaf34ec3d2a
Datum: 2025-12-25 16:58:22
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Orchestriert Chunk-Analyse (koordiniert ChunkAnalyzer + ChunkRepository)

use RuntimeException;

/**
 * Orchestrates chunk analysis: loads a chunk, asks the LLM for taxonomy,
 * entities and keywords, and writes the result back to `dokumentation_chunks`.
 *
 * NOTE(review): the constructor injects only ChunkRepository and ChunkAnalyzer,
 * but the methods below still call `$this->pdo` and `$this->ollama`, which are
 * not declared anywhere in this class. Until those calls are routed through the
 * injected collaborators (or the properties are restored), every database and
 * LLM call in this class fails at runtime.
 */
final class ChunkAnalysisService implements ChunkProcessorInterface
{
    private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';
    private const int BATCH_SIZE = 10;

    // NOTE(review): MAX_RETRIES is referenced by callLlmWithRetry() but was not
    // declared in this class. 3 is a placeholder value - confirm the intended count.
    private const int MAX_RETRIES = 3;

    public function __construct(
        private ChunkRepository $repository,
        private ChunkAnalyzer $analyzer
    ) {
    }

    /**
     * Analyzes a single chunk and persists the result.
     *
     * The chunk is marked 'processing' before the analysis starts, 'completed'
     * once the results are stored, and 'failed' (with the error message) if
     * anything goes wrong in between.
     *
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     *
     * @throws RuntimeException if the chunk does not exist or the analysis fails
     */
    public function analyzeChunk(int $chunkId): array
    {
        $chunk = $this->getChunk($chunkId);

        if ($chunk === null) {
            throw new RuntimeException("Chunk #{$chunkId} not found");
        }

        $this->updateStatus($chunkId, 'processing');

        try {
            $docContext = $this->getDocumentContext((int) $chunk['dokumentation_id']);
            $analysis = $this->performAnalysis($chunk, $docContext);
            $this->storeAnalysisResults($chunkId, $analysis);

            return $analysis;
        } catch (\Throwable $e) {
            // Catch every Throwable (not only RuntimeException) so that a chunk is
            // never left in 'processing'; the original exception is rethrown unchanged.
            $this->updateStatus($chunkId, 'failed', $e->getMessage());

            throw $e;
        }
    }

    /**
     * Analyzes up to $limit pending chunks one after another.
     *
     * A RuntimeException from a single chunk is recorded and does not stop the
     * run. A progress line is echoed after every BATCH_SIZE successful chunks.
     *
     * @return array{analyzed: int, failed: int, errors: array<string>}
     */
    public function analyzeAllPending(int $limit = 100): array
    {
        $results = ['analyzed' => 0, 'failed' => 0, 'errors' => []];

        foreach ($this->getPendingChunks($limit) as $chunk) {
            try {
                $this->analyzeChunk((int) $chunk['id']);
                $results['analyzed']++;

                if ($results['analyzed'] % self::BATCH_SIZE === 0) {
                    echo "Analyzed {$results['analyzed']} chunks...\n";
                }
            } catch (RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunk['id']}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Builds the LLM context for a chunk, runs the analysis and parses the answer.
     *
     * If the LLM answer yields no taxonomy, one is derived from the document path.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $docContext
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     */
    private function performAnalysis(array $chunk, array $docContext): array
    {
        // Prefer the cleaned content; fall back to raw content, then to an empty string.
        $content = (string) ($chunk['content_clean'] ?? $chunk['content'] ?? '');
        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);

        $context = sprintf(
            "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
            $docContext['title'],
            $docContext['path'],
            implode(' > ', $headingPath),
            $content
        );

        // One combined prompt returns taxonomy, entities and keywords in a single call.
        $prompt = $this->buildAnalysisPrompt($context);

        $response = $this->callLlmWithRetry($prompt, self::TAXONOMY_MODEL);
        $analysis = $this->parseAnalysisResponse($response);

        if ($analysis['taxonomy'] === []) {
            $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
        }

        return $analysis;
    }

    /**
     * Decodes a JSON-encoded list (or passes through an array) into a list of strings.
     *
     * NOTE(review): performAnalysis() called this method but it was not defined in
     * this class; this implementation is reconstructed from that single call site
     * (the result is joined with implode) - confirm against the intended helper.
     *
     * @return array<string>
     */
    private function decodeJsonArray(mixed $json): array
    {
        $decoded = is_string($json) ? json_decode($json, true) : $json;

        if (!is_array($decoded)) {
            return [];
        }

        $items = [];
        foreach ($decoded as $item) {
            // Only keep values that can be joined into a heading path.
            if (is_string($item) || is_int($item) || is_float($item)) {
                $items[] = (string) $item;
            }
        }

        return $items;
    }

    /**
     * Wraps the given context in the (German) instruction prompt that asks the
     * model for a JSON object with taxonomy, entities and keywords.
     */
    private function buildAnalysisPrompt(string $context): string
    {
        return <<<PROMPT
            Analysiere den folgenden technischen Dokumentationsabschnitt und extrahiere strukturierte Informationen.

            {$context}

            Antworte NUR mit einem JSON-Objekt in diesem exakten Format (keine Erklärungen):
            {
              "taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
              "entities": [
                {"name": "Entitätsname", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}
              ],
              "keywords": ["keyword1", "keyword2", "keyword3"]
            }

            Regeln:
            - taxonomy: Hierarchische Klassifikation (3 Ebenen: Bereich > Modul > Thema)
            - entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
            - keywords: 3-5 relevante Suchbegriffe
            - Antworte NUR mit dem JSON, keine anderen Texte
            PROMPT;
    }

    /**
     * Calls the LLM, retrying up to MAX_RETRIES times on RuntimeException.
     *
     * The wait between attempts grows linearly (0.5s, 1.0s, ...).
     *
     * NOTE(review): `$this->ollama` is not declared in this class - see class docblock.
     *
     * @throws RuntimeException when every attempt failed; the last error is attached as previous
     */
    private function callLlmWithRetry(string $prompt, string $model): string
    {
        $lastError = new RuntimeException('No attempts made');

        for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
            try {
                return $this->ollama->generate($prompt, $model);
            } catch (RuntimeException $e) {
                $lastError = $e;
                if ($attempt < self::MAX_RETRIES) {
                    usleep(500000 * $attempt); // Progressive backoff
                }
            }
        }

        throw new RuntimeException(
            'LLM call failed after ' . self::MAX_RETRIES . ' attempts: ' . $lastError->getMessage(),
            0,
            $lastError
        );
    }

    /**
     * Parses the LLM response into structured data.
     *
     * Tries, in order: the content of a markdown code fence, the outermost
     * `{...}` span, and the raw response. The first candidate that decodes to
     * an array wins; if none does, all three result lists are empty.
     *
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     */
    private function parseAnalysisResponse(string $response): array
    {
        $candidates = [];

        if (preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $matches) === 1) {
            $candidates[] = $matches[1];
        }
        if (preg_match('/\{[\s\S]*\}/', $response, $matches) === 1) {
            $candidates[] = $matches[0];
        }
        $candidates[] = $response;

        foreach ($candidates as $candidate) {
            $decoded = json_decode($candidate, true);

            if (is_array($decoded)) {
                return [
                    'taxonomy' => $this->validateStringList($decoded['taxonomy'] ?? []),
                    'entities' => $this->validateEntities($decoded['entities'] ?? []),
                    'keywords' => $this->validateStringList($decoded['keywords'] ?? []),
                ];
            }
        }

        return [
            'taxonomy' => [],
            'entities' => [],
            'keywords' => [],
        ];
    }

    /**
     * Reduces arbitrary decoded JSON to a list of trimmed, non-empty strings.
     *
     * @return array<string>
     */
    private function validateStringList(mixed $arr): array
    {
        if (!is_array($arr)) {
            return [];
        }

        $result = [];
        foreach ($arr as $item) {
            if (!is_string($item)) {
                continue;
            }

            $item = trim($item);
            if ($item !== '') {
                $result[] = $item;
            }
        }

        return $result;
    }

    /**
     * Reduces arbitrary decoded JSON to a list of well-formed entities.
     *
     * Entries without a non-empty string name are dropped; a missing or empty
     * type becomes 'OTHER', any other type is upper-cased.
     *
     * @return array<array{name: string, type: string}>
     */
    private function validateEntities(mixed $entities): array
    {
        if (!is_array($entities)) {
            return [];
        }

        $result = [];
        foreach ($entities as $entity) {
            if (!is_array($entity) || !isset($entity['name']) || !is_string($entity['name'])) {
                continue;
            }

            $name = trim($entity['name']);
            if ($name === '') {
                continue;
            }

            $type = isset($entity['type']) && is_string($entity['type'])
                ? strtoupper(trim($entity['type']))
                : '';

            $result[] = [
                'name' => $name,
                'type' => $type !== '' ? $type : 'OTHER',
            ];
        }

        return $result;
    }

    /**
     * Derives a taxonomy (at most three levels) from the segments of a document path.
     *
     * Known segments are mapped to display names; all others are passed through ucfirst().
     *
     * @return array<string>
     */
    private function deriveTaxonomyFromPath(string $path): array
    {
        // Compare against '' explicitly: a bare array_filter() would also drop a "0" segment.
        $parts = array_filter(
            explode('/', trim($path, '/')),
            static fn (string $part): bool => $part !== ''
        );

        $mapping = [
            'server' => 'Server',
            'modul' => 'Module',
            'anwendungen' => 'Anwendungen',
            'mcp' => 'MCP-Server',
            'ki-tasks' => 'KI-Tasks',
        ];

        $taxonomy = [];
        foreach ($parts as $part) {
            $taxonomy[] = $mapping[$part] ?? ucfirst($part);
        }

        return array_slice($taxonomy, 0, 3);
    }

    /**
     * Writes the analysis results to the chunk row and marks it 'completed'.
     *
     * NOTE(review): `$this->pdo` is not declared in this class - see class docblock.
     *
     * @param array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>} $analysis
     *
     * @throws RuntimeException if a result list cannot be encoded as JSON
     */
    private function storeAnalysisResults(int $chunkId, array $analysis): void
    {
        $taxonomyPath = $analysis['taxonomy'];
        // The first taxonomy level doubles as the category; null when there is none.
        $taxonomyCategory = $taxonomyPath[0] ?? null;

        $sql = "UPDATE dokumentation_chunks SET
                taxonomy_category = :category,
                taxonomy_path = :taxonomy,
                entities = :entities,
                keywords = :keywords,
                analysis_model = :model,
                analysis_status = 'completed',
                analysis_error = NULL,
                analyzed_at = NOW()
                WHERE id = :id";

        $stmt = $this->pdo->prepare($sql);
        $stmt->execute([
            'id' => $chunkId,
            'category' => $taxonomyCategory,
            'taxonomy' => $this->encodeJson($taxonomyPath),
            'entities' => $this->encodeJson($analysis['entities']),
            'keywords' => $this->encodeJson($analysis['keywords']),
            'model' => self::TAXONOMY_MODEL,
        ]);
    }

    /**
     * JSON-encodes a value, turning an encoding failure into a RuntimeException
     * instead of silently yielding false.
     *
     * @throws RuntimeException if the value cannot be encoded
     */
    private function encodeJson(mixed $value): string
    {
        try {
            return json_encode($value, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new RuntimeException('Failed to encode analysis data as JSON: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Sets the analysis status (and optional error text) of a chunk.
     *
     * NOTE(review): `$this->pdo` is not declared in this class - see class docblock.
     */
    private function updateStatus(int $chunkId, string $status, ?string $error = null): void
    {
        $sql = 'UPDATE dokumentation_chunks SET analysis_status = :status, analysis_error = :error WHERE id = :id';
        $stmt = $this->pdo->prepare($sql);
        $stmt->execute(['id' => $chunkId, 'status' => $status, 'error' => $error]);
    }

    /**
     * Loads a chunk row by ID, or null if no such row exists.
     *
     * @return array<string, mixed>|null
     */
    private function getChunk(int $id): ?array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $stmt->execute(['id' => $id]);
        // \PDO is fully qualified: this file lives in a namespace and does not import PDO.
        $result = $stmt->fetch(\PDO::FETCH_ASSOC);

        return $result !== false ? $result : null;
    }

    /**
     * Loads up to $limit chunks with status 'pending', ordered by document and chunk index.
     *
     * @return array<array<string, mixed>>
     */
    private function getPendingChunks(int $limit): array
    {
        $stmt = $this->pdo->prepare("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'pending'
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ");
        // LIMIT must be bound as an integer, hence bindValue() with PARAM_INT.
        $stmt->bindValue('limit', $limit, \PDO::PARAM_INT);
        $stmt->execute();

        return $stmt->fetchAll(\PDO::FETCH_ASSOC);
    }

    /**
     * Loads title and path of a document; falls back to 'Unbekannt' and '/'
     * when the document row or a column value is missing.
     *
     * @return array{title: string, path: string}
     */
    private function getDocumentContext(int $docId): array
    {
        $stmt = $this->pdo->prepare('SELECT title, path FROM dokumentation WHERE id = :id');
        $stmt->execute(['id' => $docId]);
        $result = $stmt->fetch(\PDO::FETCH_ASSOC);

        return [
            'title' => $result['title'] ?? 'Unbekannt',
            'path' => $result['path'] ?? '/',
        ];
    }

    /**
     * Returns chunk counts per analysis status and per taxonomy category.
     *
     * @return array{pending: int, processing: int, completed: int, failed: int, by_category: array<array{category: string, count: int}>}
     */
    public function getStats(): array
    {
        $stmt = $this->pdo->query("
            SELECT
                SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN analysis_status = 'processing' THEN 1 ELSE 0 END) as processing,
                SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed,
                SUM(CASE WHEN analysis_status = 'failed' THEN 1 ELSE 0 END) as failed
            FROM dokumentation_chunks
        ");
        $counts = $stmt->fetch(\PDO::FETCH_ASSOC);

        $stmt = $this->pdo->query('
            SELECT taxonomy_category as category, COUNT(*) as count
            FROM dokumentation_chunks
            WHERE taxonomy_category IS NOT NULL
            GROUP BY taxonomy_category
            ORDER BY count DESC
        ');

        // Cast explicitly so the rows match the documented shape regardless of
        // how the driver returns numeric columns.
        $byCategory = array_map(
            static fn (array $row): array => [
                'category' => (string) $row['category'],
                'count' => (int) $row['count'],
            ],
            $stmt->fetchAll(\PDO::FETCH_ASSOC)
        );

        return [
            'pending' => (int) ($counts['pending'] ?? 0),
            'processing' => (int) ($counts['processing'] ?? 0),
            'completed' => (int) ($counts['completed'] ?? 0),
            'failed' => (int) ($counts['failed'] ?? 0),
            'by_category' => $byCategory,
        ];
    }
}

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
1864 19 modified 3.5 KB 2025-12-27 23:48
1860 18 modified 3.5 KB 2025-12-27 23:47
1492 17 modified 3.5 KB 2025-12-25 17:28
1441 16 modified 3.5 KB 2025-12-25 17:00
1425 15 modified 4.5 KB 2025-12-25 16:59
1419 14 modified 13.2 KB 2025-12-25 16:59
1406 13 modified 12.8 KB 2025-12-25 16:58
1399 12 modified 12.4 KB 2025-12-25 16:58
1395 11 modified 12.5 KB 2025-12-25 16:58
856 10 modified 12.6 KB 2025-12-23 08:46
855 9 modified 12.7 KB 2025-12-23 08:46
786 8 modified 12.9 KB 2025-12-23 08:05
398 7 modified 12.9 KB 2025-12-22 08:49
397 6 modified 13.0 KB 2025-12-22 08:49
328 5 modified 13.0 KB 2025-12-22 08:08
327 4 modified 12.9 KB 2025-12-22 08:08
326 3 modified 12.9 KB 2025-12-22 08:08
36 2 modified 13.7 KB 2025-12-20 17:23
27 1 modified 13.7 KB 2025-12-20 17:18

← Zurück zur Übersicht