repo->findById($docId);

        if ($doc === null) {
            throw new RuntimeException("Document #{$docId} not found");
        }

        // Delete existing chunks for this document
        $this->deleteChunksForDocument($docId);

        // Parse and chunk the content
        $chunks = $this->parseHtmlToChunks($doc['content'], $doc['title']);

        // Store chunks
        $tokensTotal = 0;
        foreach ($chunks as $index => $chunk) {
            $this->storeChunk($docId, $index, $chunk);
            $tokensTotal += $chunk['token_count'];
        }

        return [
            'chunks_created' => count($chunks),
            'tokens_total' => $tokensTotal,
        ];
    }

    /**
     * Chunks all documents in the hierarchy.
     *
     * The hierarchy is read from the repository and walked recursively. Failures
     * on individual documents are collected in the `errors` list instead of
     * aborting the run.
     *
     * @return array{documents: int, chunks: int, tokens: int, errors: array<int, string>}
     */
    public function chunkAll(): array
    {
        $hierarchy = $this->repo->getHierarchy();
        $results = ['documents' => 0, 'chunks' => 0, 'tokens' => 0, 'errors' => []];

        $this->processHierarchy($hierarchy, $results);

        return $results;
    }

    /**
     * Chunks every item of one hierarchy level, then descends into its children.
     *
     * Each item must provide an `id`; a non-empty `children` entry is processed
     * recursively with the same accumulator. A RuntimeException raised for one
     * document is recorded in `$results['errors']` and does not prevent its
     * children or siblings from being processed.
     *
     * @param array<int, array<string, mixed>> $items
     * @param array{documents: int, chunks: int, tokens: int, errors: array<int, string>} $results
     */
    private function processHierarchy(array $items, array &$results): void
    {
        foreach ($items as $item) {
            try {
                $result = $this->chunkDocument((int) $item['id']);
                $results['documents']++;
                $results['chunks'] += $result['chunks_created'];
                $results['tokens'] += $result['tokens_total'];
            } catch (RuntimeException $e) {
                $results['errors'][] = "Doc #{$item['id']}: " . $e->getMessage();
            }

            if (!empty($item['children'])) {
                $this->processHierarchy($item['children'], $results);
            }
        }
    }

    /**
     * Parses HTML content into structured chunks.
     *
     * The markup is split at h1-h4 headings. The text collected between two
     * headings becomes one chunk, labelled with the document title plus the
     * headings that are open at that point. If nothing qualifies as a chunk, the
     * whole content is offered as a single chunk under the document title.
     * Finally, oversized chunks are passed through splitLargeChunks().
     *
     * @return array<int, array{content: string, content_clean: string, heading_path: array<int, string>, token_count: int}>
     */
    private function parseHtmlToChunks(string $html, string $docTitle): array
    {
        $chunks = [];

        // Key 0 holds the document title; key N holds the currently open h<N> heading.
        $currentHeadingPath = [$docTitle];

        // Strip PHP code if present
        $html = preg_replace('/<\?php.*?\?>/s', '', $html) ?? $html;

        // Split by headings (h1-h4); the capture group keeps each heading as its own part.
        $pattern = '/(<h[1-4][^>]*>.*?<\/h[1-4]>)/is';
        $parts = preg_split($pattern, $html, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            $parts = [$html];
        }

        $currentContent = '';
        foreach ($parts as $part) {
            // Group 1 is the heading level, group 2 the heading's inner markup.
            if (preg_match('/<h([1-4])[^>]*>(.*?)<\/h[1-4]>/is', $part, $matches) !== 1) {
                $currentContent .= $part;
                continue;
            }

            // A heading closes the section collected so far.
            if (trim($currentContent) !== '') {
                $chunk = $this->createChunk($currentContent, $currentHeadingPath);
                if ($chunk !== null) {
                    $chunks[] = $chunk;
                }
            }

            $level = (int) $matches[1];
            $headingText = strip_tags($matches[2]);
            $headingText = html_entity_decode($headingText, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $headingText = trim($headingText);

            // Drop this level and everything deeper by KEY. array_slice() would
            // re-index the path, so after a skipped level (e.g. a document that
            // starts at h2) a sibling heading would be nested under its predecessor.
            foreach (array_keys($currentHeadingPath) as $depth) {
                if ($depth >= $level) {
                    unset($currentHeadingPath[$depth]);
                }
            }
            $currentHeadingPath[$level] = $headingText;
            $currentContent = '';
        }

        // Don't forget the last chunk
        if (trim($currentContent) !== '') {
            $chunk = $this->createChunk($currentContent, $currentHeadingPath);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
        }

        // If no chunks created, create one from the whole content
        if ($chunks === []) {
            $chunk = $this->createChunk($html, [$docTitle]);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
        }

        // Split large chunks
        return $this->splitLargeChunks($chunks);
    }

    /**
     * Creates a chunk array from content.
*
     * The content is reduced to plain text via cleanHtml() and its size is
     * estimated via estimateTokens(). Returns null when the cleaned text is empty
     * or its estimate is below MIN_CHUNK_TOKENS.
     *
     * @param array<int, string> $headingPath
     * @return array{content: string, content_clean: string, heading_path: array<int, string>, token_count: int}|null
     */
    private function createChunk(string $content, array $headingPath): ?array
    {
        $cleanContent = $this->cleanHtml($content);
        if (trim($cleanContent) === '') {
            return null;
        }

        $tokenCount = $this->estimateTokens($cleanContent);
        if ($tokenCount < self::MIN_CHUNK_TOKENS) {
            return null;
        }

        // Remove only empty path entries. array_filter() without a callback
        // would also discard a heading whose text is "0".
        $headings = array_values(array_filter(
            $headingPath,
            static function ($heading): bool {
                return (string) $heading !== '';
            }
        ));

        return [
            'content' => trim($content),
            'content_clean' => $cleanContent,
            'heading_path' => $headings,
            'token_count' => $tokenCount,
        ];
    }

    /**
     * Splits chunks that exceed the maximum token limit.
     *
     * Chunks within MAX_CHUNK_TOKENS pass through unchanged. Larger chunks are
     * cut at blank lines of their cleaned text, and consecutive paragraphs are
     * packed together until the next one would exceed the limit. The resulting
     * pieces carry the plain text in both `content` and `content_clean` and
     * inherit the heading path of the chunk they came from.
     *
     * Note: a single paragraph that is larger than the limit is kept whole, and
     * a trailing piece below MIN_CHUNK_TOKENS is dropped.
     *
     * @param array<int, array{content: string, content_clean: string, heading_path: array<int, string>, token_count: int}> $chunks
     * @return array<int, array{content: string, content_clean: string, heading_path: array<int, string>, token_count: int}>
     */
    private function splitLargeChunks(array $chunks): array
    {
        $result = [];

        foreach ($chunks as $chunk) {
            if ($chunk['token_count'] <= self::MAX_CHUNK_TOKENS) {
                $result[] = $chunk;
                continue;
            }

            // Split by paragraphs (runs of blank lines in the cleaned text)
            $paragraphs = preg_split('/\n\n+/', $chunk['content_clean']);
            if ($paragraphs === false) {
                $paragraphs = [$chunk['content_clean']];
            }

            $currentText = '';
            $currentTokens = 0;

            foreach ($paragraphs as $para) {
                $paraTokens = $this->estimateTokens($para);

                if ($currentTokens + $paraTokens > self::MAX_CHUNK_TOKENS && $currentTokens > 0) {
                    // The buffer is full: emit it and start a new one with this paragraph.
                    $result[] = [
                        'content' => $currentText,
                        'content_clean' => $currentText,
                        'heading_path' => $chunk['heading_path'],
                        'token_count' => $currentTokens,
                    ];
                    $currentText = $para;
                    $currentTokens = $paraTokens;
                } else {
                    $currentText .= ($currentText !== '' ? "\n\n" : '') . $para;
                    $currentTokens += $paraTokens;
                }
            }

            // Emit the remainder, unless it is too small to be a chunk.
            if ($currentTokens >= self::MIN_CHUNK_TOKENS) {
                $result[] = [
                    'content' => $currentText,
                    'content_clean' => $currentText,
                    'heading_path' => $chunk['heading_path'],
                    'token_count' => $currentTokens,
                ];
            }
        }

        return $result;
    }

    /**
     * Cleans HTML to plain text.
*
     * Script and style elements are removed together with their contents, line
     * breaks and closing block tags become newlines, pre/code contents are
     * wrapped in Markdown-style fences/backticks, and all remaining tags are
     * stripped. Entities are decoded and whitespace is normalised afterwards.
     */
    private function cleanHtml(string $html): string
    {
        // Applied in order; each rule falls back to the unchanged input if PCRE fails.
        $rules = [
            // Remove scripts and styles
            '/<script[^>]*>.*?<\/script>/is' => '',
            '/<style[^>]*>.*?<\/style>/is' => '',
            // Convert common elements to text equivalents
            '/<br\s*\/?>/i' => "\n",
            '/<\/p>/i' => "\n\n",
            '/<\/div>/i' => "\n",
            '/<\/li>/i' => "\n",
            '/<\/tr>/i' => "\n",
            // Preserve code blocks
            '/<pre[^>]*>(.*?)<\/pre>/is' => "\n```\n$1\n```\n",
            '/<code[^>]*>(.*?)<\/code>/is' => '`$1`',
        ];

        foreach ($rules as $pattern => $replacement) {
            $html = preg_replace($pattern, $replacement, $html) ?? $html;
        }

        // Strip remaining tags
        $text = strip_tags($html);

        // Decode entities
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Normalize whitespace: collapse runs of spaces/tabs, cap blank lines at one
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Estimates token count for text.
     *
     * The estimate is the character count (multibyte-aware) divided by
     * CHARS_PER_TOKEN, rounded up.
     */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(mb_strlen($text) / self::CHARS_PER_TOKEN);
    }

    /**
     * Stores a chunk in the database.
     *
     * Inserts one row into dokumentation_chunks with analysis_status 'pending';
     * the heading path is stored as a JSON string.
     *
     * @param array{content: string, content_clean: string, heading_path: array<int, string>, token_count: int} $chunk
     *
     * @throws RuntimeException If the heading path cannot be encoded as JSON.
     */
    private function storeChunk(int $docId, int $index, array $chunk): void
    {
        // json_encode() returns false on failure (e.g. invalid UTF-8); fail loudly
        // instead of silently binding false into the row.
        $headingJson = json_encode($chunk['heading_path']);
        if ($headingJson === false) {
            throw new RuntimeException('Failed to encode heading path: ' . json_last_error_msg());
        }

        $sql = 'INSERT INTO dokumentation_chunks'
            . ' (dokumentation_id, chunk_index, content, content_clean, token_count, heading_path, analysis_status)'
            . " VALUES (:doc_id, :idx, :content, :clean, :tokens, :heading, 'pending')";

        $stmt = $this->pdo->prepare($sql);
        $stmt->execute([
            'doc_id' => $docId,
            'idx' => $index,
            'content' => $chunk['content'],
            'clean' => $chunk['content_clean'],
            'tokens' => $chunk['token_count'],
            'heading' => $headingJson,
        ]);
    }

    /**
     * Deletes all chunks for a document.
*
     * Issues one parameterised DELETE against dokumentation_chunks, matching on
     * the dokumentation_id column.
     */
    private function deleteChunksForDocument(int $docId): void
    {
        $sql = 'DELETE FROM dokumentation_chunks WHERE dokumentation_id = :doc_id';

        $this->pdo->prepare($sql)->execute(['doc_id' => $docId]);
    }

    /**
     * Gets statistics about chunks.
     *
     * A single aggregate query over dokumentation_chunks yields the row count,
     * the token sum, and the number of rows whose analysis_status is 'pending'
     * or 'completed'. Every value is cast to int; missing values count as 0.
     *
     * @return array{total_chunks: int, total_tokens: int, pending_analysis: int, completed_analysis: int}
     */
    public function getStats(): array
    {
        $sql = ' SELECT COUNT(*) as total_chunks,'
            . ' COALESCE(SUM(token_count), 0) as total_tokens,'
            . " SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending_analysis,"
            . " SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed_analysis"
            . ' FROM dokumentation_chunks ';

        $row = $this->pdo->query($sql)->fetch(PDO::FETCH_ASSOC);

        $stats = [];
        foreach (['total_chunks', 'total_tokens', 'pending_analysis', 'completed_analysis'] as $column) {
            $stats[$column] = (int) ($row[$column] ?? 0);
        }

        return $stats;
    }

    /**
     * Obtains the PDO connection from the DatabaseFactory's dev() factory method.
     */
    private function createConnection(): PDO
    {
        $connection = \Infrastructure\Config\DatabaseFactory::dev();

        return $connection;
    }
}