Backup #35

ID: 35
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkingService.php
Version: 2
Typ: modified
Größe: 12.6 KB
Hash: 8edc64eb1f8d8fd0d06bed62e40506be1fbe9c92a4448292c30d45b68f02dce7
Datum: 2025-12-20 17:23:58
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

use Infrastructure\Persistence\DokumentationRepository;
use PDO;
use RuntimeException;

/**
 * Service for chunking documentation into smaller, embeddable segments.
 *
 * HTML documents are split at h1-h4 headings. Every chunk carries the heading
 * path (document title plus enclosing headings) that was active where its
 * content appeared, a plain-text rendering of the content and an estimated
 * token count. Chunks are persisted in the `dokumentation_chunks` table.
 */
final class ChunkingService
{
    /** Chunks whose estimated token count exceeds this are split on paragraph boundaries. */
    private const int MAX_CHUNK_TOKENS = 400;

    /** Sections whose cleaned text is estimated below this many tokens are not turned into chunks. */
    private const int MIN_CHUNK_TOKENS = 50;

    /** Characters-per-token ratio used by estimateTokens(). */
    private const float CHARS_PER_TOKEN = 4.0;

    private readonly PDO $pdo;
    private readonly DokumentationRepository $repo;

    /**
     * Both collaborators are optional: when omitted, the service builds its own
     * repository and database connection exactly as before. Passing them in
     * allows reuse of an existing connection and makes the service testable.
     */
    public function __construct(?PDO $pdo = null, ?DokumentationRepository $repo = null)
    {
        $this->repo = $repo ?? new DokumentationRepository();
        $this->pdo = $pdo ?? $this->createConnection();
    }

    /**
     * Chunks a single document and replaces its stored chunks.
     *
     * The document is parsed before the database is touched, and the
     * delete + insert sequence runs inside one transaction, so a failure
     * cannot leave the document without chunks or with a partial set
     * (assuming the table's storage engine is transactional).
     *
     * @return array{chunks_created: int, tokens_total: int}
     *
     * @throws RuntimeException when the document does not exist or a chunk cannot be stored
     */
    public function chunkDocument(int $docId): array
    {
        $doc = $this->repo->findById($docId);

        if ($doc === null) {
            throw new RuntimeException("Document #{$docId} not found");
        }

        // Coerce to string so a NULL column yields "no chunks" instead of a TypeError,
        // which chunkAll() would not catch.
        $chunks = $this->parseHtmlToChunks(
            (string) ($doc['content'] ?? ''),
            (string) ($doc['title'] ?? ''),
        );

        $tokensTotal = 0;

        $this->pdo->beginTransaction();

        try {
            $this->deleteChunksForDocument($docId);

            foreach ($chunks as $index => $chunk) {
                $this->storeChunk($docId, $index, $chunk);
                $tokensTotal += $chunk['token_count'];
            }

            $this->pdo->commit();
        } catch (\Throwable $e) {
            // Restore the previous chunk set, then let the caller see the original error.
            if ($this->pdo->inTransaction()) {
                $this->pdo->rollBack();
            }

            throw $e;
        }

        return [
            'chunks_created' => count($chunks),
            'tokens_total' => $tokensTotal,
        ];
    }

    /**
     * Chunks all documents in the hierarchy.
     *
     * @return array{documents: int, chunks: int, tokens: int, errors: array<string>}
     */
    public function chunkAll(): array
    {
        $hierarchy = $this->repo->getHierarchy();
        $results = ['documents' => 0, 'chunks' => 0, 'tokens' => 0, 'errors' => []];

        $this->processHierarchy($hierarchy, $results);

        return $results;
    }

    /**
     * Chunks every item of the tree depth-first and accumulates totals in $results.
     *
     * A RuntimeException raised for one document (this includes PDOException,
     * which extends RuntimeException) is recorded in $results['errors'];
     * processing then continues with the item's children and siblings.
     *
     * @param array<array<string, mixed>> $items
     * @param array{documents: int, chunks: int, tokens: int, errors: array<string>} $results
     */
    private function processHierarchy(array $items, array &$results): void
    {
        foreach ($items as $item) {
            try {
                $result = $this->chunkDocument((int) $item['id']);
                $results['documents']++;
                $results['chunks'] += $result['chunks_created'];
                $results['tokens'] += $result['tokens_total'];
            } catch (RuntimeException $e) {
                $results['errors'][] = "Doc #{$item['id']}: " . $e->getMessage();
            }

            if (!empty($item['children'])) {
                $this->processHierarchy($item['children'], $results);
            }
        }
    }

    /**
     * Parses HTML content into structured chunks.
     *
     * The HTML is split at h1-h4 elements. Content between two headings becomes
     * one chunk labelled with the heading path that was active while it was
     * collected. Oversized chunks are split afterwards.
     *
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function parseHtmlToChunks(string $html, string $docTitle): array
    {
        // Strip PHP code if present
        $html = preg_replace('/<\?php.*?\?>/s', '', $html) ?? $html;

        // Split by headings (h1-h4); the headings themselves are kept as separate parts
        $parts = preg_split(
            '/(<h[1-4][^>]*>.*?<\/h[1-4]>)/is',
            $html,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY,
        );

        if ($parts === false) {
            $parts = [$html];
        }

        $chunks = [];
        // Keyed by heading level: 0 is the document title, 1-4 are h1-h4.
        $headingPath = [0 => $docTitle];
        $buffer = '';

        foreach ($parts as $part) {
            if (preg_match('/<h([1-4])[^>]*>(.*?)<\/h[1-4]>/is', $part, $matches) !== 1) {
                $buffer .= $part;
                continue;
            }

            // A heading closes the section collected so far
            // (createChunk() returns null for empty or too-short content).
            $chunk = $this->createChunk($buffer, $headingPath);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
            $buffer = '';

            $level = (int) $matches[1];
            $headingText = strip_tags($matches[2]);
            $headingText = html_entity_decode($headingText, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $headingText = trim($headingText);

            // Drop every heading at the same or a deeper level. This filters by level
            // key rather than by position: a positional array_slice() keeps stale
            // sibling headings whenever a level is skipped (e.g. a document that
            // starts at h2), producing paths such as [title, A, B] for section B.
            $headingPath = array_filter(
                $headingPath,
                static fn (int $depth): bool => $depth < $level,
                ARRAY_FILTER_USE_KEY,
            );
            $headingPath[$level] = $headingText;
        }

        // Content after the last heading
        $chunk = $this->createChunk($buffer, $headingPath);
        if ($chunk !== null) {
            $chunks[] = $chunk;
        }

        // No section was large enough on its own: try the document as a whole
        if ($chunks === []) {
            $chunk = $this->createChunk($html, [$docTitle]);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
        }

        return $this->splitLargeChunks($chunks);
    }

    /**
     * Creates a chunk array from content.
     *
     * Returns null when the cleaned text is empty or its estimated token count
     * is below MIN_CHUNK_TOKENS.
     *
     * @param array<string> $headingPath
     * @return array{content: string, content_clean: string, heading_path: array<string>, token_count: int}|null
     */
    private function createChunk(string $content, array $headingPath): ?array
    {
        $cleanContent = $this->cleanHtml($content);

        if ($cleanContent === '') {
            return null;
        }

        $tokenCount = $this->estimateTokens($cleanContent);

        if ($tokenCount < self::MIN_CHUNK_TOKENS) {
            return null;
        }

        // Remove only empty headings; a callback-less array_filter() would also
        // discard a heading whose text is "0".
        $path = array_filter($headingPath, static fn (string $heading): bool => $heading !== '');

        return [
            'content' => trim($content),
            'content_clean' => $cleanContent,
            'heading_path' => array_values($path),
            'token_count' => $tokenCount,
        ];
    }

    /**
     * Splits chunks that exceed the maximum token limit.
     *
     * Oversized chunks are re-assembled from their paragraphs (separated by blank
     * lines in the cleaned text) into pieces of at most MAX_CHUNK_TOKENS. The
     * pieces carry plain text in both `content` and `content_clean`, because the
     * split is performed on the cleaned text.
     *
     * The limit is soft: a single paragraph larger than the limit is kept whole,
     * and a trailing remainder below MIN_CHUNK_TOKENS is appended to the
     * preceding piece rather than discarded, so no text is lost.
     *
     * @param array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}> $chunks
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function splitLargeChunks(array $chunks): array
    {
        $result = [];

        foreach ($chunks as $chunk) {
            if ($chunk['token_count'] <= self::MAX_CHUNK_TOKENS) {
                $result[] = $chunk;
                continue;
            }

            $paragraphs = preg_split('/\n\n+/', $chunk['content_clean']);
            if ($paragraphs === false) {
                $paragraphs = [$chunk['content_clean']];
            }

            /** @var list<array{text: string, tokens: int}> $pieces */
            $pieces = [];
            $text = '';
            $tokens = 0;

            foreach ($paragraphs as $para) {
                $paraTokens = $this->estimateTokens($para);

                if ($tokens > 0 && $tokens + $paraTokens > self::MAX_CHUNK_TOKENS) {
                    // Adding this paragraph would overflow: close the current piece
                    $pieces[] = ['text' => $text, 'tokens' => $tokens];
                    $text = $para;
                    $tokens = $paraTokens;
                    continue;
                }

                $text .= ($text !== '' ? "\n\n" : '') . $para;
                $tokens += $paraTokens;
            }

            if ($tokens > 0) {
                $last = array_key_last($pieces);

                if ($tokens < self::MIN_CHUNK_TOKENS && $last !== null) {
                    // Too small to stand alone: keep the text by attaching it to the previous piece
                    $pieces[$last]['text'] .= "\n\n" . $text;
                    $pieces[$last]['tokens'] += $tokens;
                } else {
                    $pieces[] = ['text' => $text, 'tokens' => $tokens];
                }
            }

            foreach ($pieces as $piece) {
                $result[] = [
                    'content' => $piece['text'],
                    'content_clean' => $piece['text'],
                    'heading_path' => $chunk['heading_path'],
                    'token_count' => $piece['tokens'],
                ];
            }
        }

        return $result;
    }

    /**
     * Cleans HTML to plain text.
     *
     * Scripts and styles are removed, block-level closing tags become line
     * breaks, pre/code elements are rewritten to Markdown-style fences and
     * backticks, remaining tags are stripped, entities are decoded and
     * whitespace is normalised.
     */
    private function cleanHtml(string $html): string
    {
        // Applied in order; each step falls back to its input if PCRE reports an error.
        $rewrites = [
            // Remove scripts and styles
            '/<script[^>]*>.*?<\/script>/is' => '',
            '/<style[^>]*>.*?<\/style>/is' => '',
            // Convert common elements to text equivalents
            '/<br\s*\/?>/i' => "\n",
            '/<\/p>/i' => "\n\n",
            '/<\/div>/i' => "\n",
            '/<\/li>/i' => "\n",
            '/<\/tr>/i' => "\n",
            // Preserve code blocks
            '/<pre[^>]*>(.*?)<\/pre>/is' => "\n```\n$1\n```\n",
            '/<code[^>]*>(.*?)<\/code>/is' => '`$1`',
        ];

        foreach ($rewrites as $pattern => $replacement) {
            $html = preg_replace($pattern, $replacement, $html) ?? $html;
        }

        // Strip remaining tags, then decode entities
        $text = strip_tags($html);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Normalize whitespace: collapse runs of spaces/tabs and limit blank lines to one
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Estimates token count for text as character count divided by CHARS_PER_TOKEN, rounded up.
     */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(mb_strlen($text) / self::CHARS_PER_TOKEN);
    }

    /**
     * Stores a chunk in the database with analysis status 'pending'.
     *
     * @param array{content: string, content_clean: string, heading_path: array<string>, token_count: int} $chunk
     *
     * @throws RuntimeException when the heading path cannot be JSON-encoded or the insert fails
     */
    private function storeChunk(int $docId, int $index, array $chunk): void
    {
        try {
            $headingJson = json_encode($chunk['heading_path'], JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // Surface the failure as RuntimeException so chunkAll() reports it per document
            // instead of binding json_encode()'s `false` into the statement.
            throw new RuntimeException(
                "Cannot encode heading path of chunk {$index} for document #{$docId}: " . $e->getMessage(),
                previous: $e,
            );
        }

        $sql = "INSERT INTO dokumentation_chunks
                (dokumentation_id, chunk_index, content, content_clean, token_count, heading_path, analysis_status)
                VALUES (:doc_id, :idx, :content, :clean, :tokens, :heading, 'pending')";

        $stmt = $this->pdo->prepare($sql);
        $stmt->execute([
            'doc_id' => $docId,
            'idx' => $index,
            'content' => $chunk['content'],
            'clean' => $chunk['content_clean'],
            'tokens' => $chunk['token_count'],
            'heading' => $headingJson,
        ]);
    }

    /**
     * Deletes all chunks for a document.
     */
    private function deleteChunksForDocument(int $docId): void
    {
        $stmt = $this->pdo->prepare('DELETE FROM dokumentation_chunks WHERE dokumentation_id = :doc_id');
        $stmt->execute(['doc_id' => $docId]);
    }

    /**
     * Gets statistics about chunks.
     *
     * @return array{total_chunks: int, total_tokens: int, pending_analysis: int, completed_analysis: int}
     */
    public function getStats(): array
    {
        $stmt = $this->pdo->query("
            SELECT
                COUNT(*) as total_chunks,
                COALESCE(SUM(token_count), 0) as total_tokens,
                SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending_analysis,
                SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed_analysis
            FROM dokumentation_chunks
        ");

        $result = $stmt->fetch(PDO::FETCH_ASSOC);

        // The SUM(CASE ...) columns are NULL on an empty table; `?? 0` maps that to zero.
        return [
            'total_chunks' => (int) ($result['total_chunks'] ?? 0),
            'total_tokens' => (int) ($result['total_tokens'] ?? 0),
            'pending_analysis' => (int) ($result['pending_analysis'] ?? 0),
            'completed_analysis' => (int) ($result['completed_analysis'] ?? 0),
        ];
    }

    /**
     * Opens the connection to the local `ki_dev` database with exceptions enabled.
     *
     * NOTE(review): host, database and the `root` account are hard-coded here;
     * consider moving them to configuration and using a less privileged account.
     */
    private function createConnection(): PDO
    {
        $password = $this->getPassword();

        return new PDO(
            'mysql:host=localhost;dbname=ki_dev;charset=utf8mb4',
            'root',
            $password,
            [
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
            ]
        );
    }

    /**
     * Reads the database password from the credentials file.
     *
     * The file is scanned line by line for the first line that mentions both
     * "MariaDB" and "root" and has at least four `|`-separated fields; the
     * trimmed fourth field is returned. An empty string is returned when the
     * file is unreadable or no such line exists.
     */
    private function getPassword(): string
    {
        $file = '/var/www/docs/credentials/credentials.md';

        // file_get_contents() raises an E_WARNING for a missing file; check first so the
        // documented empty-string fallback is reached without one.
        if (!is_readable($file)) {
            return '';
        }

        $content = file_get_contents($file);

        if ($content === false) {
            return '';
        }

        foreach (explode("\n", $content) as $line) {
            if (!str_contains($line, 'MariaDB') || !str_contains($line, 'root')) {
                continue;
            }

            $parts = explode('|', $line);
            if (count($parts) >= 4) {
                return trim($parts[3]);
            }
        }

        return '';
    }
}

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
858 5 modified 11.6 KB 2025-12-23 08:46
857 4 modified 11.7 KB 2025-12-23 08:46
784 3 modified 11.8 KB 2025-12-23 08:05
35 2 modified 12.6 KB 2025-12-20 17:23
29 1 modified 12.6 KB 2025-12-20 17:18

← Zurück zur Übersicht