ChunkSyncService.php
- Pfad: src/Infrastructure/Docs/ChunkSyncService.php
- Namespace: Infrastructure\Docs
- Zeilen: 303 | Größe: 9,528 Bytes
- Geändert: 2025-12-28 14:01:44 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 77
- Dependencies: 100 (25%)
- LOC: 23 (20%)
- Methods: 60 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 9
- trait Infrastructure\Traits\JsonDecodeTrait
- constructor PDO
- constructor Infrastructure\AI\OllamaService
- constructor Infrastructure\AI\QdrantClient
- use Domain\Constants
- use Infrastructure\AI\OllamaService
- use Infrastructure\AI\QdrantClient
- use Infrastructure\Traits\JsonDecodeTrait
- use PDO
Klassen 1
- ChunkSyncService class Zeile 15
Funktionen 14
- __construct() public Zeile 23
- ensureCollection() public Zeile 33
- syncChunk() public Zeile 41
- syncAllPending() public Zeile 73
- syncAll() Zeile 106
- getStats() Zeile 138
- prepareTextForEmbedding() Zeile 149
- sanitizeForEmbedding() Zeile 189
- buildPayload() Zeile 209
- updateQdrantId() Zeile 234
- getChunk() Zeile 245
- getDocument() Zeile 259
- getUnsyncedChunks() Zeile 273
- getAllAnalyzedChunks() Zeile 292
Verwendet von 2
- Doc2VectorPipeline.php constructor
- InfrastructureServiceProvider.php use
Versionen 17
- v17
  2025-12-28 14:01 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v16
  2025-12-27 23:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v15
  2025-12-27 23:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v14
  2025-12-27 23:45 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v13
  2025-12-25 13:01 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Write-Operation
- v12
  2025-12-23 08:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v11
  2025-12-23 08:46 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v10
  2025-12-23 08:05 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v9
  2025-12-22 08:54 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v8
  2025-12-22 08:23 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v7
  2025-12-22 08:23 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v6
  2025-12-22 08:08 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v5
  2025-12-22 08:08 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v4
  2025-12-22 08:08 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v3
  2025-12-22 08:08 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v2
  2025-12-20 17:23 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
- v1
  2025-12-20 17:18 | claude-code-hook | modified
  Claude Code Pre-Hook Backup vor Edit-Operation
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
// @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant
use Domain\Constants;
use Infrastructure\AI\OllamaService;
use Infrastructure\AI\QdrantClient;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
/**
 * Synchronises analysed documentation chunks into a Qdrant collection.
 *
 * For every chunk the service loads the chunk and its parent document from the
 * database, builds an embedding text, requests an embedding from
 * OllamaService::getEmbedding() and upserts the vector together with a metadata
 * payload into the Qdrant collection named by self::COLLECTION.
 */
final class ChunkSyncService
{
    use JsonDecodeTrait;

    /** Name of the Qdrant collection all chunks are written to. */
    private const string COLLECTION = 'dokumentation_chunks';

    /**
     * Vector size passed to QdrantClient::ensureCollection().
     * NOTE(review): presumably has to match the embedding model's dimension — confirm.
     */
    private const int VECTOR_SIZE = 1024;

    /** A progress line is echoed every time this many chunks have been synced. */
    private const int BATCH_SIZE = 10;

    /** Maximum number of characters of content stored as `content_preview`. */
    private const int PREVIEW_LENGTH = 300;

    public function __construct(
        private readonly PDO $pdo,
        private readonly OllamaService $ollama,
        private readonly QdrantClient $qdrant,
    ) {
    }

    /**
     * Ensures the Qdrant collection exists, delegating to QdrantClient::ensureCollection().
     *
     * @return bool The result reported by the Qdrant client.
     */
    public function ensureCollection(): bool
    {
        return $this->qdrant->ensureCollection(self::COLLECTION, self::VECTOR_SIZE);
    }

    /**
     * Syncs a single chunk to Qdrant.
     *
     * Returns false without contacting Ollama or Qdrant when the chunk does not
     * exist or its `analysis_status` is not 'completed'. A chunk that has no
     * `qdrant_id` yet gets a freshly generated one, which is persisted only
     * after the upsert succeeded.
     *
     * @return bool True when the point was upserted successfully.
     */
    public function syncChunk(int $chunkId): bool
    {
        $chunk = $this->getChunk($chunkId);
        if ($chunk === null || $chunk['analysis_status'] !== 'completed') {
            return false;
        }

        // A missing parent document yields an empty array; title and path then fall back to ''.
        $doc = $this->getDocument((int) $chunk['dokumentation_id']);

        $embedding = $this->ollama->getEmbedding($this->prepareTextForEmbedding($chunk, $doc));
        $payload = $this->buildPayload($chunk, $doc);

        // Read the stored id once so that the "is new" decision below uses the same value.
        $existingId = $chunk['qdrant_id'] ?? null;
        $qdrantId = $existingId ?? $this->qdrant->generateUuid();

        $success = $this->qdrant->upsertPoint(self::COLLECTION, $qdrantId, $embedding, $payload);
        if ($success && $existingId === null) {
            $this->updateQdrantId($chunkId, $qdrantId);
        }

        return $success;
    }

    /**
     * Syncs analysed chunks that have no `qdrant_id` yet.
     *
     * @param int $limit Maximum number of chunks fetched for this run.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    public function syncAllPending(int $limit = Constants::DEFAULT_LIMIT): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getUnsyncedChunks($limit));
    }

    /**
     * Re-syncs every analysed chunk, whether or not it already has a `qdrant_id`.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    public function syncAll(): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getAllAnalyzedChunks());
    }

    /**
     * Gets collection statistics from the Qdrant client.
     *
     * @return array{points_count: int, status: string}|null
     */
    public function getStats(): ?array
    {
        return $this->qdrant->getCollectionStats(self::COLLECTION);
    }

    /**
     * Syncs the given chunk rows one by one and tallies the outcome.
     *
     * Every failure is counted and recorded in `errors`: a `false` result from
     * syncChunk() as "Sync failed", a \RuntimeException with its message. Other
     * throwables are not caught and abort the run.
     *
     * @param array<array<string, mixed>> $chunks Rows containing at least an `id` column.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    private function syncChunks(array $chunks): array
    {
        $results = ['synced' => 0, 'failed' => 0, 'errors' => []];

        foreach ($chunks as $chunk) {
            $chunkId = (int) $chunk['id'];

            try {
                if ($this->syncChunk($chunkId)) {
                    $results['synced']++;
                    // Progress output for long runs.
                    if ($results['synced'] % self::BATCH_SIZE === 0) {
                        echo "Synced {$results['synced']} chunks...\n";
                    }

                    continue;
                }

                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunkId}: Sync failed";
            } catch (\RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunkId}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Builds the text that is sent to the embedding model.
     *
     * The text consists of labelled sections separated by blank lines: document
     * title, heading path, taxonomy path, keywords (each only when non-empty)
     * and the sanitised content. The content is truncated to
     * Constants::EMBEDDING_TEXT_LIMIT and the assembled text to
     * Constants::EMBEDDING_PAYLOAD_LIMIT characters.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     */
    private function prepareTextForEmbedding(array $chunk, array $doc): string
    {
        $parts = ['Dokument: ' . ($doc['title'] ?? '')];

        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);
        if (!empty($headingPath)) {
            $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);
        }

        $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);
        if (!empty($taxonomy)) {
            $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);
        }

        $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);
        if (!empty($keywords)) {
            $parts[] = 'Keywords: ' . implode(', ', $keywords);
        }

        $parts[] = 'Inhalt: ' . $this->truncate($this->cleanContent($chunk), Constants::EMBEDDING_TEXT_LIMIT);

        return $this->truncate(implode("\n\n", $parts), Constants::EMBEDDING_PAYLOAD_LIMIT);
    }

    /**
     * Sanitizes text for embedding by removing problematic characters.
     *
     * Invalid UTF-8 is repaired first: the `/u` patterns below return null on
     * malformed input, which would otherwise skip those replacements silently.
     */
    private function sanitizeForEmbedding(string $text): string
    {
        // Re-encoding UTF-8 to UTF-8 substitutes invalid byte sequences, so the /u patterns can run.
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');

        // U+2500–U+25FF: box drawing, block elements and geometric shapes, each replaced by a space.
        $text = preg_replace('/[\x{2500}-\x{25FF}]/u', ' ', $text) ?? $text;
        // Control characters except tab, line feed and carriage return.
        $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text) ?? $text;
        // Collapse runs of spaces/tabs and limit consecutive newlines to two.
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Builds the metadata payload stored alongside the vector in Qdrant.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     *
     * @return array<string, mixed>
     */
    private function buildPayload(array $chunk, array $doc): array
    {
        return [
            'chunk_id' => (int) $chunk['id'],
            'doc_id' => (int) $chunk['dokumentation_id'],
            'chunk_index' => (int) $chunk['chunk_index'],
            'path' => $doc['path'] ?? '',
            'title' => $doc['title'] ?? '',
            'content_preview' => $this->truncate($this->cleanContent($chunk), self::PREVIEW_LENGTH),
            'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
            'taxonomy_category' => $chunk['taxonomy_category'] ?? null,
            'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
            'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),
            'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),
            'token_count' => (int) ($chunk['token_count'] ?? 0),
        ];
    }

    /**
     * Returns the sanitised chunk content, preferring `content_clean` over `content`.
     *
     * @param array<string, mixed> $chunk
     */
    private function cleanContent(array $chunk): string
    {
        return $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');
    }

    /**
     * Cuts $text to $limit characters and appends '...' when it was longer.
     *
     * The ellipsis is added on top of the limit, so a truncated result is
     * $limit + 3 characters long.
     */
    private function truncate(string $text, int $limit): string
    {
        return mb_strlen($text) > $limit
            ? mb_substr($text, 0, $limit) . '...'
            : $text;
    }

    /**
     * Stores the Qdrant point id on the chunk row.
     */
    private function updateQdrantId(int $chunkId, string $qdrantId): void
    {
        $stmt = $this->pdo->prepare('UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id');
        $stmt->execute(['id' => $chunkId, 'qid' => $qdrantId]);
    }

    /**
     * Gets a chunk row by id.
     *
     * @return array<string, mixed>|null Null when no row with this id exists.
     */
    private function getChunk(int $id): ?array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Gets a document row by id.
     *
     * @return array<string, mixed> Empty array when no row with this id exists.
     */
    private function getDocument(int $id): array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? [] : $row;
    }

    /**
     * Gets chunks that are analysed but have no `qdrant_id`, ordered by document and chunk index.
     *
     * @return array<array<string, mixed>>
     */
    private function getUnsyncedChunks(int $limit): array
    {
        $stmt = $this->pdo->prepare("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed' AND qdrant_id IS NULL
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ");
        // LIMIT must be bound as an integer; a string binding would be quoted.
        $stmt->bindValue('limit', $limit, PDO::PARAM_INT);
        $stmt->execute();

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all analysed chunks, ordered by document and chunk index.
     *
     * @return array<array<string, mixed>>
     *
     * @throws \RuntimeException When the query fails while PDO is not in exception error mode.
     */
    private function getAllAnalyzedChunks(): array
    {
        $stmt = $this->pdo->query("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed'
            ORDER BY dokumentation_id, chunk_index
        ");

        // PDO::query() returns false instead of throwing when the error mode is not ERRMODE_EXCEPTION.
        if ($stmt === false) {
            throw new \RuntimeException('Failed to query analyzed chunks');
        }

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }
}