Backup #35
| ID | 35 |
| Dateipfad | /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkingService.php |
| Version | 2 |
| Typ | modified |
| Größe | 12.6 KB |
| Hash | 8edc64eb1f8d8fd0d06bed62e40506be1fbe9c92a4448292c30d45b68f02dce7 |
| Datum | 2025-12-20 17:23:58 |
| Geändert von | claude-code-hook |
| Grund | Claude Code Pre-Hook Backup vor Edit-Operation |
| Datei existiert | Ja |
Dateiinhalt
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
use Infrastructure\Persistence\DokumentationRepository;
use PDO;
use RuntimeException;
/**
 * Service for chunking documentation into smaller, embeddable segments.
 *
 * Splits HTML documentation into structural chunks based on h1-h4 headings,
 * records the heading trail ("heading path") of every chunk, and stores the
 * result in the `dokumentation_chunks` table with analysis status 'pending'.
 *
 * Token counts are estimates derived from the character count
 * (CHARS_PER_TOKEN characters per token), not real tokenizer output.
 */
final class ChunkingService
{
    /** Chunks estimated above this size are split at paragraph boundaries. */
    private const int MAX_CHUNK_TOKENS = 400;

    /** Sections estimated below this size are not stored as chunks. */
    private const int MIN_CHUNK_TOKENS = 50;

    /** Heuristic used by estimateTokens(). */
    private const float CHARS_PER_TOKEN = 4.0;

    private PDO $pdo;
    private DokumentationRepository $repo;

    /**
     * Both collaborators are optional so existing `new ChunkingService()` calls keep
     * working; passing them explicitly allows sharing a connection or substituting
     * test doubles.
     *
     * @param DokumentationRepository|null $repo Document source; a new repository is created when null.
     * @param PDO|null                     $pdo  Connection used for chunk storage; created via createConnection() when null.
     */
    public function __construct(?DokumentationRepository $repo = null, ?PDO $pdo = null)
    {
        $this->repo = $repo ?? new DokumentationRepository();
        $this->pdo = $pdo ?? $this->createConnection();
    }

    /**
     * Chunks a single document and replaces its stored chunks.
     *
     * The content is parsed before anything is deleted, and the delete plus all
     * inserts run in one transaction, so a failure leaves the previously stored
     * chunks untouched instead of a partially written set.
     *
     * @return array{chunks_created: int, tokens_total: int}
     *
     * @throws RuntimeException When the document does not exist or a chunk cannot be stored
     *                          (PDOException is a RuntimeException as well).
     */
    public function chunkDocument(int $docId): array
    {
        $doc = $this->repo->findById($docId);
        if ($doc === null) {
            throw new RuntimeException("Document #{$docId} not found");
        }

        // Null content/title would raise a TypeError under strict_types, which the
        // RuntimeException handler in processHierarchy() would not catch.
        $chunks = $this->parseHtmlToChunks(
            (string) ($doc['content'] ?? ''),
            (string) ($doc['title'] ?? ''),
        );

        // Only manage the transaction when the (possibly injected) connection is not
        // already inside one; PDO does not support nested transactions.
        $ownsTransaction = !$this->pdo->inTransaction();
        if ($ownsTransaction) {
            $this->pdo->beginTransaction();
        }

        $tokensTotal = 0;
        try {
            $this->deleteChunksForDocument($docId);

            foreach ($chunks as $index => $chunk) {
                $this->storeChunk($docId, $index, $chunk);
                $tokensTotal += $chunk['token_count'];
            }

            if ($ownsTransaction) {
                $this->pdo->commit();
            }
        } catch (\Throwable $e) {
            if ($ownsTransaction && $this->pdo->inTransaction()) {
                $this->pdo->rollBack();
            }

            throw $e;
        }

        return [
            'chunks_created' => count($chunks),
            'tokens_total' => $tokensTotal,
        ];
    }

    /**
     * Chunks all documents in the hierarchy returned by the repository.
     *
     * Failures of individual documents are collected in `errors`; they do not
     * stop the run.
     *
     * @return array{documents: int, chunks: int, tokens: int, errors: array<string>}
     */
    public function chunkAll(): array
    {
        $hierarchy = $this->repo->getHierarchy();
        $results = ['documents' => 0, 'chunks' => 0, 'tokens' => 0, 'errors' => []];
        $this->processHierarchy($hierarchy, $results);

        return $results;
    }

    /**
     * Walks the hierarchy depth-first, chunking every item and then its children.
     *
     * Children are processed even when their parent failed.
     *
     * @param array<array<string, mixed>> $items
     * @param array{documents: int, chunks: int, tokens: int, errors: array<string>} $results Accumulator, updated in place.
     */
    private function processHierarchy(array $items, array &$results): void
    {
        foreach ($items as $item) {
            try {
                $result = $this->chunkDocument((int) $item['id']);
                $results['documents']++;
                $results['chunks'] += $result['chunks_created'];
                $results['tokens'] += $result['tokens_total'];
            } catch (RuntimeException $e) {
                $results['errors'][] = "Doc #{$item['id']}: " . $e->getMessage();
            }

            if (!empty($item['children']) && is_array($item['children'])) {
                $this->processHierarchy($item['children'], $results);
            }
        }
    }

    /**
     * Parses HTML content into structured chunks.
     *
     * The HTML is split at h1-h4 headings. The text between two headings becomes
     * one chunk carrying the heading path that was active for it. Sections below
     * MIN_CHUNK_TOKENS are discarded by createChunk(); chunks above
     * MAX_CHUNK_TOKENS are split afterwards by splitLargeChunks().
     *
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function parseHtmlToChunks(string $html, string $docTitle): array
    {
        $chunks = [];

        // Keyed by depth: key 0 is the document title, key N (1-4) the current <hN> text.
        $currentHeadingPath = [$docTitle];

        // Strip PHP code if present
        $html = preg_replace('/<\?php.*?\?>/s', '', $html) ?? $html;

        // Split by headings (h1-h4); the headings themselves are kept as separate parts.
        $pattern = '/(<h[1-4][^>]*>.*?<\/h[1-4]>)/is';
        $parts = preg_split($pattern, $html, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            $parts = [$html];
        }

        $currentContent = '';
        foreach ($parts as $part) {
            if (preg_match('/<h([1-4])[^>]*>(.*?)<\/h[1-4]>/is', $part, $matches) !== 1) {
                // Body text: accumulate until the next heading.
                $currentContent .= $part;
                continue;
            }

            // A heading closes the section collected so far (createChunk() returns
            // null for empty or too-short sections).
            $chunk = $this->createChunk($currentContent, $currentHeadingPath);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }

            $level = (int) $matches[1];
            $headingText = strip_tags($matches[2]);
            $headingText = html_entity_decode($headingText, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $headingText = trim($headingText);

            // Drop every entry at this depth or deeper, by KEY. A positional
            // array_slice() reindexes the array, so after a skipped level (e.g. a
            // document starting with <h2>) a sibling heading would be nested under
            // the previous one instead of replacing it.
            $currentHeadingPath = array_filter(
                $currentHeadingPath,
                static fn (int $depth): bool => $depth < $level,
                ARRAY_FILTER_USE_KEY,
            );
            $currentHeadingPath[$level] = $headingText;
            $currentContent = '';
        }

        // Section after the last heading (or the whole document when it has no headings).
        $chunk = $this->createChunk($currentContent, $currentHeadingPath);
        if ($chunk !== null) {
            $chunks[] = $chunk;
        }

        // If no section qualified on its own, try the whole content as a single chunk.
        if ($chunks === []) {
            $chunk = $this->createChunk($html, [$docTitle]);
            if ($chunk !== null) {
                $chunks[] = $chunk;
            }
        }

        return $this->splitLargeChunks($chunks);
    }

    /**
     * Creates a chunk array from content.
     *
     * @param array<string> $headingPath Depth-keyed heading trail; returned as a plain list.
     * @return array{content: string, content_clean: string, heading_path: array<string>, token_count: int}|null
     *         Null when the cleaned text is empty or below MIN_CHUNK_TOKENS.
     */
    private function createChunk(string $content, array $headingPath): ?array
    {
        $cleanContent = $this->cleanHtml($content);
        if ($cleanContent === '') {
            return null;
        }

        $tokenCount = $this->estimateTokens($cleanContent);
        if ($tokenCount < self::MIN_CHUNK_TOKENS) {
            return null;
        }

        // Remove only empty segments; a bare array_filter() would also drop a
        // heading whose text is "0".
        $path = array_values(array_filter(
            $headingPath,
            static fn ($segment): bool => $segment !== '',
        ));

        return [
            'content' => trim($content),
            'content_clean' => $cleanContent,
            'heading_path' => $path,
            'token_count' => $tokenCount,
        ];
    }

    /**
     * Splits chunks that exceed the maximum token limit.
     *
     * Oversized chunks are re-packed paragraph by paragraph (paragraph = text
     * separated by blank lines in the cleaned content). The resulting pieces carry
     * the cleaned text in both `content` and `content_clean`. A single paragraph
     * that is itself larger than MAX_CHUNK_TOKENS is not split any further.
     *
     * A trailing remainder smaller than MIN_CHUNK_TOKENS is appended to the previous
     * piece of the same chunk rather than discarded, so no text is lost; that piece
     * may therefore exceed the maximum by less than MIN_CHUNK_TOKENS.
     *
     * @param array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}> $chunks
     * @return array<array{content: string, content_clean: string, heading_path: array<string>, token_count: int}>
     */
    private function splitLargeChunks(array $chunks): array
    {
        $result = [];

        foreach ($chunks as $chunk) {
            if ($chunk['token_count'] <= self::MAX_CHUNK_TOKENS) {
                $result[] = $chunk;
                continue;
            }

            $paragraphs = preg_split('/\n\n+/', $chunk['content_clean']);
            if ($paragraphs === false) {
                $paragraphs = [$chunk['content_clean']];
            }

            $pieces = [];
            $currentText = '';
            $currentTokens = 0;

            foreach ($paragraphs as $para) {
                $paraTokens = $this->estimateTokens($para);

                if ($currentTokens > 0 && $currentTokens + $paraTokens > self::MAX_CHUNK_TOKENS) {
                    // Adding this paragraph would overflow: close the current piece
                    // and start a new one with the paragraph.
                    $pieces[] = [
                        'content' => $currentText,
                        'content_clean' => $currentText,
                        'heading_path' => $chunk['heading_path'],
                        'token_count' => $currentTokens,
                    ];
                    $currentText = $para;
                    $currentTokens = $paraTokens;
                    continue;
                }

                $currentText .= ($currentText !== '' ? "\n\n" : '') . $para;
                $currentTokens += $paraTokens;
            }

            if ($currentTokens >= self::MIN_CHUNK_TOKENS) {
                $pieces[] = [
                    'content' => $currentText,
                    'content_clean' => $currentText,
                    'heading_path' => $chunk['heading_path'],
                    'token_count' => $currentTokens,
                ];
            } elseif ($currentTokens > 0 && $pieces !== []) {
                // Small remainder: keep the text by attaching it to the previous piece.
                $last = count($pieces) - 1;
                $merged = $pieces[$last]['content_clean'] . "\n\n" . $currentText;
                $pieces[$last]['content'] = $merged;
                $pieces[$last]['content_clean'] = $merged;
                $pieces[$last]['token_count'] += $currentTokens;
            }

            foreach ($pieces as $piece) {
                $result[] = $piece;
            }
        }

        return $result;
    }

    /**
     * Cleans HTML to plain text.
     *
     * Scripts and styles are removed, block-level closing tags become line breaks,
     * <pre>/<code> are rendered as Markdown-style fences/backticks, remaining tags
     * are stripped, entities decoded and whitespace normalised.
     */
    private function cleanHtml(string $html): string
    {
        // Remove scripts and styles
        $html = preg_replace('/<script[^>]*>.*?<\/script>/is', '', $html) ?? $html;
        $html = preg_replace('/<style[^>]*>.*?<\/style>/is', '', $html) ?? $html;

        // Convert common elements to text equivalents
        $html = preg_replace('/<br\s*\/?>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/p>/i', "\n\n", $html) ?? $html;
        $html = preg_replace('/<\/div>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/li>/i', "\n", $html) ?? $html;
        $html = preg_replace('/<\/tr>/i', "\n", $html) ?? $html;

        // Preserve code blocks
        $html = preg_replace('/<pre[^>]*>(.*?)<\/pre>/is', "\n```\n$1\n```\n", $html) ?? $html;
        $html = preg_replace('/<code[^>]*>(.*?)<\/code>/is', '`$1`', $html) ?? $html;

        // Strip remaining tags
        $text = strip_tags($html);

        // Decode entities
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Normalize whitespace: collapse runs of spaces/tabs, cap blank lines at one.
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Estimates the token count for text as ceil(characters / CHARS_PER_TOKEN).
     */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(mb_strlen($text) / self::CHARS_PER_TOKEN);
    }

    /**
     * Stores a chunk in the database with analysis status 'pending'.
     *
     * @param array{content: string, content_clean: string, heading_path: array<string>, token_count: int} $chunk
     *
     * @throws RuntimeException When the heading path cannot be JSON-encoded or the insert fails.
     */
    private function storeChunk(int $docId, int $index, array $chunk): void
    {
        try {
            $headingJson = json_encode($chunk['heading_path'], JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // Re-thrown as RuntimeException so processHierarchy() records it per
            // document; without the flag json_encode() would return false and that
            // value would be bound as the heading path.
            throw new RuntimeException(
                "Cannot encode heading path for document #{$docId}, chunk {$index}: " . $e->getMessage(),
                0,
                $e,
            );
        }

        $sql = "INSERT INTO dokumentation_chunks
            (dokumentation_id, chunk_index, content, content_clean, token_count, heading_path, analysis_status)
            VALUES (:doc_id, :idx, :content, :clean, :tokens, :heading, 'pending')";

        $stmt = $this->pdo->prepare($sql);
        $stmt->execute([
            'doc_id' => $docId,
            'idx' => $index,
            'content' => $chunk['content'],
            'clean' => $chunk['content_clean'],
            'tokens' => $chunk['token_count'],
            'heading' => $headingJson,
        ]);
    }

    /**
     * Deletes all chunks for a document.
     */
    private function deleteChunksForDocument(int $docId): void
    {
        $stmt = $this->pdo->prepare('DELETE FROM dokumentation_chunks WHERE dokumentation_id = :doc_id');
        $stmt->execute(['doc_id' => $docId]);
    }

    /**
     * Gets statistics about chunks.
     *
     * @return array{total_chunks: int, total_tokens: int, pending_analysis: int, completed_analysis: int}
     */
    public function getStats(): array
    {
        $stmt = $this->pdo->query("
            SELECT
                COUNT(*) as total_chunks,
                COALESCE(SUM(token_count), 0) as total_tokens,
                SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending_analysis,
                SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed_analysis
            FROM dokumentation_chunks
        ");

        // The status sums are NULL on an empty table; ?? plus the int cast maps that to 0.
        $result = $stmt->fetch(PDO::FETCH_ASSOC);

        return [
            'total_chunks' => (int) ($result['total_chunks'] ?? 0),
            'total_tokens' => (int) ($result['total_tokens'] ?? 0),
            'pending_analysis' => (int) ($result['pending_analysis'] ?? 0),
            'completed_analysis' => (int) ($result['completed_analysis'] ?? 0),
        ];
    }

    /**
     * Opens the connection to the local `ki_dev` database as user `root`.
     *
     * Errors are reported as exceptions and rows are fetched as associative arrays.
     */
    private function createConnection(): PDO
    {
        $password = $this->getPassword();

        return new PDO(
            'mysql:host=localhost;dbname=ki_dev;charset=utf8mb4',
            'root',
            $password,
            [
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
                // Use server-side prepared statements instead of client-side emulation.
                PDO::ATTR_EMULATE_PREPARES => false,
            ],
        );
    }

    /**
     * Reads the database password from the credentials markdown file.
     *
     * Looks for the first line containing both "MariaDB" and "root" and returns the
     * fourth pipe-separated field of that line. Best effort: returns an empty string
     * when the file is missing/unreadable or no such line exists.
     */
    private function getPassword(): string
    {
        $file = '/var/www/docs/credentials/credentials.md';

        // Checked up front so a missing file does not emit a PHP warning.
        if (!is_file($file) || !is_readable($file)) {
            return '';
        }

        $content = file_get_contents($file);
        if ($content === false) {
            return '';
        }

        foreach (explode("\n", $content) as $line) {
            if (str_contains($line, 'MariaDB') && str_contains($line, 'root')) {
                $parts = explode('|', $line);
                if (count($parts) >= 4) {
                    return trim($parts[3]);
                }
            }
        }

        return '';
    }
}
Vollständig herunterladen
Aktionen
Andere Versionen dieser Datei
| ID | Version | Typ | Größe | Datum |
| 858 | 5 | modified | 11.6 KB | 2025-12-23 08:46 |
| 857 | 4 | modified | 11.7 KB | 2025-12-23 08:46 |
| 784 | 3 | modified | 11.8 KB | 2025-12-23 08:05 |
| 35 | 2 | modified | 12.6 KB | 2025-12-20 17:23 |
| 29 | 1 | modified | 12.6 KB | 2025-12-20 17:18 |
← Zurück zur Übersicht