Backup #1856
| ID | 1856 |
| Dateipfad | /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkSyncService.php |
| Version | 16 |
| Typ | modified |
| Größe | 9.2 KB |
| Hash | 96ba1f6a4db26090819120ce495f6f49351149f88e0b55ae225569d602b27646 |
| Datum | 2025-12-27 23:46:59 |
| Geändert von | claude-code-hook |
| Grund | Claude Code Pre-Hook Backup vor Edit-Operation |
| Datei existiert | Ja |
Dateiinhalt
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
// @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant
use Domain\Constants;
use Infrastructure\AI\OllamaService;
use Infrastructure\AI\QdrantClient;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
/**
 * Pushes analysed documentation chunks, together with their embeddings,
 * into the Qdrant collection named by self::COLLECTION.
 *
 * Chunk and document rows are read from the `dokumentation_chunks` and
 * `dokumentation` tables via PDO; embeddings come from the injected
 * OllamaService; points are written through the injected QdrantClient.
 */
final class ChunkSyncService
{
    use JsonDecodeTrait;

    private const string COLLECTION = 'dokumentation_chunks';
    private const int VECTOR_SIZE = 1024;
    /** Progress is echoed every BATCH_SIZE successfully synced chunks. */
    private const int BATCH_SIZE = 10;
    /** Maximum number of characters of chunk content put into the embedding text. */
    private const int MAX_CONTENT_LENGTH = 1000;
    /** Maximum number of characters of the complete embedding text. */
    private const int MAX_TEXT_LENGTH = 1800;
    /** Maximum number of characters of the content preview stored in the payload. */
    private const int PREVIEW_LENGTH = 300;

    public function __construct(
        private readonly PDO $pdo,
        private readonly OllamaService $ollama,
        private readonly QdrantClient $qdrant
    ) {
    }

    /**
     * Asks the Qdrant client to ensure the collection exists, passing the
     * collection name and the configured vector size.
     *
     * @return bool Result reported by the Qdrant client.
     */
    public function ensureCollection(): bool
    {
        return $this->qdrant->ensureCollection(self::COLLECTION, self::VECTOR_SIZE);
    }

    /**
     * Syncs a single chunk to Qdrant.
     *
     * Returns false without contacting Ollama or Qdrant when the chunk does
     * not exist or its analysis_status is not 'completed'. On a successful
     * upsert of a chunk that had no qdrant_id yet, the newly generated id is
     * written back to the database.
     *
     * @param int $chunkId Primary key of the chunk row.
     *
     * @return bool True when the upsert was reported as successful.
     */
    public function syncChunk(int $chunkId): bool
    {
        $chunk = $this->getChunk($chunkId);
        if ($chunk === null || $chunk['analysis_status'] !== 'completed') {
            return false;
        }

        $doc = $this->getDocument((int) $chunk['dokumentation_id']);
        $embedding = $this->ollama->getEmbedding($this->prepareTextForEmbedding($chunk, $doc));
        $payload = $this->buildPayload($chunk, $doc);

        // Re-use the stored point id so a re-sync overwrites the same point;
        // only chunks without an id get a fresh UUID.
        $existingId = $chunk['qdrant_id'] ?? null;
        $qdrantId = $existingId ?? $this->qdrant->generateUuid();

        $success = $this->qdrant->upsertPoint(self::COLLECTION, $qdrantId, $embedding, $payload);
        if ($success && $existingId === null) {
            $this->updateQdrantId($chunkId, $qdrantId);
        }

        return $success;
    }

    /**
     * Syncs analysed chunks that have no qdrant_id yet.
     *
     * @param int $limit Maximum number of chunks to process in this call.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    public function syncAllPending(int $limit = Constants::DEFAULT_LIMIT): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getUnsyncedChunks($limit));
    }

    /**
     * Syncs every analysed chunk again, including those already carrying a qdrant_id.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    public function syncAll(): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getAllAnalyzedChunks());
    }

    /**
     * Gets collection statistics from the Qdrant client.
     *
     * @return array{points_count: int, status: string}|null
     */
    public function getStats(): ?array
    {
        return $this->qdrant->getCollectionStats(self::COLLECTION);
    }

    /**
     * Syncs the given chunk rows one by one and tallies the outcome.
     *
     * Shared by syncAllPending() and syncAll() so both report failures the
     * same way: every failed chunk increments `failed` AND adds a message to
     * `errors`. A RuntimeException thrown for one chunk is recorded and does
     * not abort the remaining chunks.
     *
     * @param array<array<string, mixed>> $chunks Rows containing at least an `id` column.
     *
     * @return array{synced: int, failed: int, errors: list<string>}
     */
    private function syncChunks(array $chunks): array
    {
        $results = ['synced' => 0, 'failed' => 0, 'errors' => []];

        foreach ($chunks as $chunk) {
            try {
                if (!$this->syncChunk((int) $chunk['id'])) {
                    $results['failed']++;
                    $results['errors'][] = "Chunk #{$chunk['id']}: Sync failed";
                    continue;
                }

                $results['synced']++;
                if ($results['synced'] % self::BATCH_SIZE === 0) {
                    echo "Synced {$results['synced']} chunks...\n";
                }
            } catch (\RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunk['id']}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Builds the text that is sent to the embedding model.
     *
     * The text consists of labelled sections (document title, heading path,
     * taxonomy path, keywords, content) joined by blank lines. Content is cut
     * to MAX_CONTENT_LENGTH characters and the complete text to
     * MAX_TEXT_LENGTH characters; '...' is appended whenever a cut happens.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     */
    private function prepareTextForEmbedding(array $chunk, array $doc): string
    {
        $parts = ['Dokument: ' . ($doc['title'] ?? '')];

        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);
        if (!empty($headingPath)) {
            $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);
        }

        $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);
        if (!empty($taxonomy)) {
            $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);
        }

        $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);
        if (!empty($keywords)) {
            $parts[] = 'Keywords: ' . implode(', ', $keywords);
        }

        $content = $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');
        if (mb_strlen($content) > self::MAX_CONTENT_LENGTH) {
            $content = mb_substr($content, 0, self::MAX_CONTENT_LENGTH) . '...';
        }
        $parts[] = 'Inhalt: ' . $content;

        $text = implode("\n\n", $parts);
        if (mb_strlen($text) > self::MAX_TEXT_LENGTH) {
            $text = mb_substr($text, 0, self::MAX_TEXT_LENGTH) . '...';
        }

        return $text;
    }

    /**
     * Sanitizes text for embedding by removing problematic characters.
     *
     * Replaces box-drawing (U+2500-257F), block-element (U+2580-259F) and
     * geometric-shape (U+25A0-25FF) characters with a space, removes ASCII
     * control characters except tab, LF and CR, collapses runs of spaces/tabs
     * to one space and runs of three or more newlines to two, then trims.
     */
    private function sanitizeForEmbedding(string $text): string
    {
        // Repair the encoding FIRST: preg_replace() with the /u modifier
        // returns null on malformed UTF-8, which would make the `?? $text`
        // fallbacks below silently skip the Unicode-aware replacements.
        $normalized = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        if (is_string($normalized)) {
            $text = $normalized;
        }

        $text = preg_replace('/[\x{2500}-\x{257F}]/u', ' ', $text) ?? $text;
        $text = preg_replace('/[\x{2580}-\x{259F}]/u', ' ', $text) ?? $text;
        $text = preg_replace('/[\x{25A0}-\x{25FF}]/u', ' ', $text) ?? $text;
        $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text) ?? $text;
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Builds the payload stored alongside the vector in Qdrant.
     *
     * The content preview is the sanitized content cut to PREVIEW_LENGTH
     * characters, with '...' appended when it was cut.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     *
     * @return array<string, mixed>
     */
    private function buildPayload(array $chunk, array $doc): array
    {
        $content = $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');
        $preview = mb_strlen($content) > self::PREVIEW_LENGTH
            ? mb_substr($content, 0, self::PREVIEW_LENGTH) . '...'
            : $content;

        return [
            'chunk_id' => (int) $chunk['id'],
            'doc_id' => (int) $chunk['dokumentation_id'],
            'chunk_index' => (int) $chunk['chunk_index'],
            'path' => $doc['path'] ?? '',
            'title' => $doc['title'] ?? '',
            'content_preview' => $preview,
            'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
            'taxonomy_category' => $chunk['taxonomy_category'] ?? null,
            'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
            'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),
            'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),
            'token_count' => (int) ($chunk['token_count'] ?? 0),
        ];
    }

    /**
     * Stores the Qdrant point id on the chunk row.
     */
    private function updateQdrantId(int $chunkId, string $qdrantId): void
    {
        $stmt = $this->pdo->prepare('UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id');
        $stmt->execute(['id' => $chunkId, 'qid' => $qdrantId]);
    }

    /**
     * Gets a chunk row by ID.
     *
     * @return array<string, mixed>|null Null when no row matches.
     */
    private function getChunk(int $id): ?array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Gets a document row by ID.
     *
     * @return array<string, mixed> Empty array when no row matches.
     */
    private function getDocument(int $id): array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? [] : $row;
    }

    /**
     * Gets analysed chunks whose qdrant_id is still NULL, ordered by
     * document and chunk index.
     *
     * @return array<array<string, mixed>>
     */
    private function getUnsyncedChunks(int $limit): array
    {
        $stmt = $this->pdo->prepare("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed' AND qdrant_id IS NULL
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ");
        // Bound explicitly as an integer so the value is not quoted as a string in LIMIT.
        $stmt->bindValue('limit', $limit, PDO::PARAM_INT);
        $stmt->execute();

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all analysed chunks, ordered by document and chunk index.
     *
     * @return array<array<string, mixed>>
     */
    private function getAllAnalyzedChunks(): array
    {
        $stmt = $this->pdo->query("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed'
            ORDER BY dokumentation_id, chunk_index
        ");

        // PDO::query() returns false on failure when the connection is not
        // in exception error mode; treat that as "nothing to sync".
        if ($stmt === false) {
            return [];
        }

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }
}
Vollständig herunterladen
Aktionen
Andere Versionen dieser Datei
| ID | Version | Typ | Größe | Datum |
| 1999 | 17 | modified | 9.2 KB | 2025-12-28 14:01 |
| 1856 | 16 | modified | 9.2 KB | 2025-12-27 23:46 |
| 1854 | 15 | modified | 9.2 KB | 2025-12-27 23:46 |
| 1849 | 14 | modified | 9.2 KB | 2025-12-27 23:45 |
| 1281 | 13 | modified | 16.3 KB | 2025-12-25 13:01 |
| 852 | 12 | modified | 16.4 KB | 2025-12-23 08:46 |
| 851 | 11 | modified | 16.5 KB | 2025-12-23 08:46 |
| 785 | 10 | modified | 16.6 KB | 2025-12-23 08:05 |
| 406 | 9 | modified | 16.4 KB | 2025-12-22 08:54 |
| 364 | 8 | modified | 16.4 KB | 2025-12-22 08:23 |
| 363 | 7 | modified | 16.3 KB | 2025-12-22 08:23 |
| 325 | 6 | modified | 16.3 KB | 2025-12-22 08:08 |
| 324 | 5 | modified | 16.3 KB | 2025-12-22 08:08 |
| 323 | 4 | modified | 16.3 KB | 2025-12-22 08:08 |
| 322 | 3 | modified | 16.2 KB | 2025-12-22 08:08 |
| 37 | 2 | modified | 17.0 KB | 2025-12-20 17:23 |
| 28 | 1 | modified | 17.0 KB | 2025-12-20 17:18 |
← Zurück zur Übersicht