ChunkingService.php
- Pfad: src/Infrastructure/Docs/ChunkingService.php
- Namespace: Infrastructure\Docs
- Zeilen: 350 | Größe: 11,733 Bytes
- Geändert: 2025-12-23 08:49:25 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 76
- Dependencies: 100 (25%)
- LOC: 0 (20%)
- Methods: 80 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 5
- constructor PDO
- constructor Infrastructure\Persistence\DokumentationRepository
- use Infrastructure\Persistence\DokumentationRepository
- use PDO
- use RuntimeException
Klassen 1
- ChunkingService (class) Zeile 13
Funktionen 12
- __construct() public Zeile 19
- chunkDocument() public Zeile 30
- chunkAll() Zeile 62
- processHierarchy() Zeile 76
- parseHtmlToChunks() Zeile 99
- createChunk() Zeile 172
- splitLargeChunks() Zeile 200
- cleanHtml() Zeile 255
- estimateTokens() Zeile 288
- storeChunk() Zeile 298
- deleteChunksForDocument() Zeile 318
- getStats() Zeile 329
Verwendet von 2
- Doc2VectorPipeline.php constructor
- InfrastructureServiceProvider.php use
Versionen 5
- v5
  2025-12-23 08:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v4
  2025-12-23 08:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v3
  2025-12-23 08:05 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v2
  2025-12-20 17:23 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v1
  2025-12-20 17:18 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
// @responsibility: Zerlegt Dokumentation in embedding-fähige Chunks
use Infrastructure\Persistence\DokumentationRepository;
use PDO;
use RuntimeException;
/**
 * Splits documentation HTML into heading-scoped plain-text chunks and stores
 * them in the `dokumentation_chunks` table with analysis status 'pending'.
 *
 * Token counts are estimates derived from the character length of the cleaned
 * text (see CHARS_PER_TOKEN); no real tokenizer is involved.
 */
final class ChunkingService
{
    /** Estimated-token limit above which a chunk is split along paragraph boundaries. */
    private const int MAX_CHUNK_TOKENS = 400;

    /** Sections whose cleaned text is estimated below this size are not turned into chunks. */
    private const int MIN_CHUNK_TOKENS = 50;

    /** Heuristic characters-per-token ratio used by estimateTokens(). */
    private const float CHARS_PER_TOKEN = 4.0;

    public function __construct(
        private readonly PDO $pdo,
        private readonly DokumentationRepository $repo,
    ) {
    }

    /**
     * Chunks a single document and replaces its stored chunks.
     *
     * The document is parsed first; the delete of the old chunks and the inserts
     * of the new ones then run inside one transaction, so a failing insert does
     * not leave the document without chunks. If the caller already opened a
     * transaction on the connection, it is reused and left open.
     *
     * @return array{chunks_created: int, tokens_total: int}
     *
     * @throws RuntimeException When the document does not exist or a heading path
     *                          cannot be JSON-encoded. Database errors surface as
     *                          PDOException (a RuntimeException subclass) when the
     *                          connection uses exception error mode.
     */
    public function chunkDocument(int $docId): array
    {
        $doc = $this->repo->findById($docId);
        if ($doc === null) {
            throw new RuntimeException("Document #{$docId} not found");
        }

        // NOTE(review): assumes findById() returns an array row with 'content' and
        // 'title'. Null columns are coerced to '' so they cannot raise a TypeError,
        // which processHierarchy() would not catch.
        $chunks = $this->parseHtmlToChunks(
            (string) ($doc['content'] ?? ''),
            (string) ($doc['title'] ?? ''),
        );

        $ownsTransaction = !$this->pdo->inTransaction();
        if ($ownsTransaction) {
            $this->pdo->beginTransaction();
        }

        $tokensTotal = 0;
        try {
            $this->deleteChunksForDocument($docId);
            foreach ($chunks as $index => $chunk) {
                $this->storeChunk($docId, $index, $chunk);
                $tokensTotal += $chunk['token_count'];
            }
            if ($ownsTransaction) {
                $this->pdo->commit();
            }
        } catch (\Throwable $e) {
            // Restore the previous chunks, then let the original error propagate.
            if ($ownsTransaction && $this->pdo->inTransaction()) {
                $this->pdo->rollBack();
            }

            throw $e;
        }

        return [
            'chunks_created' => count($chunks),
            'tokens_total' => $tokensTotal,
        ];
    }

    /**
     * Chunks every document returned by the repository's hierarchy, including
     * nested children.
     *
     * @return array{documents: int, chunks: int, tokens: int, errors: array<string>}
     */
    public function chunkAll(): array
    {
        $results = ['documents' => 0, 'chunks' => 0, 'tokens' => 0, 'errors' => []];
        $this->processHierarchy($this->repo->getHierarchy(), $results);

        return $results;
    }

    /**
     * Walks the hierarchy depth-first, chunking each item and accumulating totals.
     *
     * A RuntimeException for one document is recorded in $results['errors'] and
     * does not stop the traversal; its children are still processed.
     *
     * @param array<array<string, mixed>> $items
     * @param array{documents: int, chunks: int, tokens: int, errors: array<string>} $results
     */
    private function processHierarchy(array $items, array &$results): void
    {
        foreach ($items as $item) {
            try {
                $result = $this->chunkDocument((int) $item['id']);
                $results['documents']++;
                $results['chunks'] += $result['chunks_created'];
                $results['tokens'] += $result['tokens_total'];
            } catch (RuntimeException $e) {
                $results['errors'][] = "Doc #{$item['id']}: " . $e->getMessage();
            }

            $children = $item['children'] ?? [];
            if (is_array($children) && $children !== []) {
                $this->processHierarchy($children, $results);
            }
        }
    }

    /**
     * Parses HTML content into chunks, one per section between h1-h4 headings.
     *
     * Heading texts are not part of the chunk content; they are tracked as a
     * path (document title, then the open heading per level) attached to each chunk.
     *
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function parseHtmlToChunks(string $html, string $docTitle): array
    {
        // Strip embedded PHP blocks (only those with a closing tag).
        $html = preg_replace('/<\?php.*?\?>/s', '', $html) ?? $html;

        // Split on h1-h4 elements, keeping the headings themselves as parts.
        $parts = preg_split(
            '/(<h[1-4][^>]*>.*?<\/h[1-4]>)/is',
            $html,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY,
        );
        if ($parts === false) {
            $parts = [$html];
        }

        $chunks = [];
        // Keyed by heading level; key 0 is the document title.
        /** @var array<int, string> $headingPath */
        $headingPath = [0 => $docTitle];
        $sectionHtml = '';

        foreach ($parts as $part) {
            if (preg_match('/<h([1-4])[^>]*>(.*?)<\/h[1-4]>/is', $part, $matches) !== 1) {
                $sectionHtml .= $part;
                continue;
            }

            // A heading closes the section collected so far.
            $chunk = $this->createChunk($sectionHtml, $headingPath);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
            $sectionHtml = '';

            $level = (int) $matches[1];
            $headingText = trim(
                html_entity_decode(strip_tags($matches[2]), ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            );

            // Drop every heading at this level or deeper. The filter works on the
            // level keys: array_slice() would re-index the array and keep a stale
            // sibling whenever a level was skipped (e.g. h2 without a preceding h1).
            $headingPath = array_filter(
                $headingPath,
                static fn (int $depth): bool => $depth < $level,
                ARRAY_FILTER_USE_KEY,
            );
            $headingPath[$level] = $headingText;
        }

        // Section after the last heading (or the whole document if it has none).
        $chunk = $this->createChunk($sectionHtml, $headingPath);
        if ($chunk !== null) {
            $chunks[] = $chunk;
        }

        // Every section was too small on its own: try the document as one chunk.
        if ($chunks === []) {
            $chunk = $this->createChunk($html, [$docTitle]);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
        }

        return $this->splitLargeChunks($chunks);
    }

    /**
     * Builds a chunk from a section's HTML, or returns null when the cleaned text
     * is empty or estimated below MIN_CHUNK_TOKENS.
     *
     * @param array<string> $headingPath
     * @return array{content: string, content_clean: string, heading_path: array<string>, token_count: int}|null
     */
    private function createChunk(string $content, array $headingPath): ?array
    {
        $cleanContent = $this->cleanHtml($content);
        if ($cleanContent === '') {
            return null;
        }

        $tokenCount = $this->estimateTokens($cleanContent);
        if ($tokenCount < self::MIN_CHUNK_TOKENS) {
            return null;
        }

        return [
            'content' => trim($content),
            'content_clean' => $cleanContent,
            // Only empty entries are removed; a heading whose text is "0" is kept.
            'heading_path' => array_values(array_filter(
                $headingPath,
                static fn (string $heading): bool => $heading !== '',
            )),
            'token_count' => $tokenCount,
        ];
    }

    /**
     * Splits chunks that exceed MAX_CHUNK_TOKENS along blank-line paragraph
     * boundaries of their cleaned text.
     *
     * Paragraphs are packed greedily into pieces of at most MAX_CHUNK_TOKENS. A
     * trailing remainder below MIN_CHUNK_TOKENS is appended to the preceding piece
     * instead of being discarded, so that piece may slightly exceed the maximum.
     * Split pieces carry the cleaned text in both 'content' and 'content_clean'.
     * A chunk that cannot be split (a single oversized paragraph) is returned
     * unchanged; there is no sentence-level splitting.
     *
     * @param array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}> $chunks
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function splitLargeChunks(array $chunks): array
    {
        $result = [];

        foreach ($chunks as $chunk) {
            if ($chunk['token_count'] <= self::MAX_CHUNK_TOKENS) {
                $result[] = $chunk;
                continue;
            }

            $paragraphs = preg_split('/\n\n+/', $chunk['content_clean'], -1, PREG_SPLIT_NO_EMPTY);
            if ($paragraphs === false) {
                $paragraphs = [$chunk['content_clean']];
            }

            /** @var list<string> $pieces */
            $pieces = [];
            $buffer = '';
            $bufferTokens = 0;

            foreach ($paragraphs as $para) {
                $paraTokens = $this->estimateTokens($para);

                if ($bufferTokens > 0 && $bufferTokens + $paraTokens > self::MAX_CHUNK_TOKENS) {
                    // Adding this paragraph would overflow: close the current piece.
                    $pieces[] = $buffer;
                    $buffer = $para;
                    $bufferTokens = $paraTokens;
                    continue;
                }

                $buffer .= ($buffer !== '' ? "\n\n" : '') . $para;
                $bufferTokens += $paraTokens;
            }

            if ($buffer !== '') {
                if ($bufferTokens < self::MIN_CHUNK_TOKENS && $pieces !== []) {
                    // Too small to stand alone: keep the text by merging it backwards.
                    $pieces[array_key_last($pieces)] .= "\n\n" . $buffer;
                } else {
                    $pieces[] = $buffer;
                }
            }

            if (count($pieces) <= 1) {
                // Nothing was actually split; keep the chunk with its original HTML.
                $result[] = $chunk;
                continue;
            }

            foreach ($pieces as $piece) {
                $result[] = [
                    'content' => $piece,
                    'content_clean' => $piece,
                    'heading_path' => $chunk['heading_path'],
                    // Same estimate as createChunk(), taken over the final piece text.
                    'token_count' => $this->estimateTokens($piece),
                ];
            }
        }

        return $result;
    }

    /**
     * Converts an HTML fragment to plain text.
     *
     * Scripts and styles are removed, block-level closers become line breaks,
     * <pre> blocks become fenced code, inline <code> becomes backtick-quoted,
     * remaining tags are stripped and entities decoded. Runs of spaces/tabs are
     * collapsed (this also applies inside code blocks) and three or more
     * consecutive newlines are reduced to two.
     */
    private function cleanHtml(string $html): string
    {
        // Normalise line endings so the newline-based rules below also apply to CRLF input.
        $html = str_replace(["\r\n", "\r"], "\n", $html);

        // Remove scripts and styles including their content.
        $html = preg_replace('/<script[^>]*>.*?<\/script>/is', '', $html) ?? $html;
        $html = preg_replace('/<style[^>]*>.*?<\/style>/is', '', $html) ?? $html;

        // Convert common elements to text equivalents.
        $html = preg_replace('/<br\s*\/?>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/p>/i', "\n\n", $html) ?? $html;
        $html = preg_replace('/<\/div>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/li>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/tr>/i', "\n", $html) ?? $html;

        // Fence <pre> blocks. Inner tags are stripped here so that a nested
        // <pre><code> does not additionally get inline backticks from the next rule.
        $html = preg_replace_callback(
            '/<pre[^>]*>(.*?)<\/pre>/is',
            static fn (array $m): string => "\n```\n" . strip_tags($m[1]) . "\n```\n",
            $html,
        ) ?? $html;
        $html = preg_replace('/<code[^>]*>(.*?)<\/code>/is', '`$1`', $html) ?? $html;

        $text = strip_tags($html);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Normalize whitespace.
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Estimates the token count of a text as ceil(characters / CHARS_PER_TOKEN).
     */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(mb_strlen($text) / self::CHARS_PER_TOKEN);
    }

    /**
     * Inserts one chunk row with analysis status 'pending'.
     *
     * The heading path is stored as a JSON array; invalid UTF-8 in a heading is
     * substituted rather than causing the encoding to fail.
     *
     * @param array{content: string, content_clean: string, heading_path: array<string>, token_count: int} $chunk
     *
     * @throws RuntimeException When the heading path cannot be JSON-encoded.
     */
    private function storeChunk(int $docId, int $index, array $chunk): void
    {
        try {
            $headingJson = json_encode(
                $chunk['heading_path'],
                JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE,
            );
        } catch (\JsonException $e) {
            // Re-thrown as RuntimeException so processHierarchy() records it per document.
            throw new RuntimeException(
                "Cannot encode heading path for document #{$docId}, chunk {$index}",
                previous: $e,
            );
        }

        $sql = <<<'SQL'
            INSERT INTO dokumentation_chunks
            (dokumentation_id, chunk_index, content, content_clean, token_count, heading_path, analysis_status)
            VALUES (:doc_id, :idx, :content, :clean, :tokens, :heading, 'pending')
            SQL;

        $stmt = $this->pdo->prepare($sql);
        $stmt->execute([
            'doc_id' => $docId,
            'idx' => $index,
            'content' => $chunk['content'],
            'clean' => $chunk['content_clean'],
            'tokens' => $chunk['token_count'],
            'heading' => $headingJson,
        ]);
    }

    /**
     * Deletes all stored chunks of a document.
     */
    private function deleteChunksForDocument(int $docId): void
    {
        $stmt = $this->pdo->prepare('DELETE FROM dokumentation_chunks WHERE dokumentation_id = :doc_id');
        $stmt->execute(['doc_id' => $docId]);
    }

    /**
     * Returns aggregate counts over all stored chunks.
     *
     * All values are 0 when the table is empty or the query yields no row.
     *
     * @return array{total_chunks: int, total_tokens: int, pending_analysis: int, completed_analysis: int}
     */
    public function getStats(): array
    {
        $stmt = $this->pdo->query(<<<'SQL'
            SELECT
                COUNT(*) as total_chunks,
                COALESCE(SUM(token_count), 0) as total_tokens,
                SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending_analysis,
                SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed_analysis
            FROM dokumentation_chunks
            SQL);

        // query() returns false in silent error mode and fetch() returns false without a row.
        $row = $stmt === false ? false : $stmt->fetch(PDO::FETCH_ASSOC);
        if (!is_array($row)) {
            $row = [];
        }

        return [
            'total_chunks' => (int) ($row['total_chunks'] ?? 0),
            'total_tokens' => (int) ($row['total_tokens'] ?? 0),
            'pending_analysis' => (int) ($row['pending_analysis'] ?? 0),
            'completed_analysis' => (int) ($row['completed_analysis'] ?? 0),
        ];
    }
}