Backup #1856

ID: 1856
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkSyncService.php
Version: 16
Typ: modified
Größe: 9.2 KB
Hash: 96ba1f6a4db26090819120ce495f6f49351149f88e0b55ae225569d602b27646
Datum: 2025-12-27 23:46:59
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

// @responsibility: Synchronisiert Chunks mit Embeddings nach Qdrant

use Domain\Constants;
use Infrastructure\AI\OllamaService;
use Infrastructure\AI\QdrantClient;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;

/**
 * Synchronises analysed documentation chunks, together with their embeddings,
 * into the Qdrant collection named by self::COLLECTION.
 *
 * Chunks are read from the `dokumentation_chunks` table, their parent document
 * from `dokumentation`. The embedding is requested from the injected
 * OllamaService and the resulting point is upserted through the injected
 * QdrantClient.
 */
final class ChunkSyncService
{
    use JsonDecodeTrait;

    private const string COLLECTION = 'dokumentation_chunks';
    private const int VECTOR_SIZE = 1024;

    /** A progress line is echoed every time this many chunks have been synced. */
    private const int BATCH_SIZE = 10;

    /** Maximum number of characters of chunk content included in the embedding text. */
    private const int MAX_CONTENT_CHARS = 1000;

    /** Maximum number of characters of the complete embedding text. */
    private const int MAX_EMBEDDING_TEXT_CHARS = 1800;

    /** Maximum number of characters of the content preview stored in the payload. */
    private const int MAX_PREVIEW_CHARS = 300;

    /** Suffix appended to any text that had to be shortened. */
    private const string ELLIPSIS = '...';

    public function __construct(
        private PDO $pdo,
        private OllamaService $ollama,
        private QdrantClient $qdrant
    ) {
    }

    /**
     * Ensures the Qdrant collection exists with the configured vector size.
     *
     * @return bool the result reported by QdrantClient::ensureCollection()
     */
    public function ensureCollection(): bool
    {
        return $this->qdrant->ensureCollection(self::COLLECTION, self::VECTOR_SIZE);
    }

    /**
     * Syncs a single chunk to Qdrant.
     *
     * Returns false without contacting Ollama or Qdrant when the chunk does not
     * exist or its analysis_status is not 'completed'. Otherwise the chunk is
     * embedded and upserted; a newly generated point id is written back to the
     * database only after a successful upsert.
     *
     * @return bool true when the point was upserted successfully
     */
    public function syncChunk(int $chunkId): bool
    {
        $chunk = $this->getChunk($chunkId);

        if ($chunk === null || ($chunk['analysis_status'] ?? null) !== 'completed') {
            return false;
        }

        $doc = $this->getDocument((int) $chunk['dokumentation_id']);
        $embedding = $this->ollama->getEmbedding($this->prepareTextForEmbedding($chunk, $doc));
        $payload = $this->buildPayload($chunk, $doc);

        // Read the stored id once so the "is this a new point?" decision below
        // is based on exactly the same value that was used to pick the id.
        $existingId = $chunk['qdrant_id'] ?? null;
        $qdrantId = $existingId ?? $this->qdrant->generateUuid();

        $success = $this->qdrant->upsertPoint(self::COLLECTION, $qdrantId, $embedding, $payload);

        if ($success && $existingId === null) {
            $this->updateQdrantId($chunkId, $qdrantId);
        }

        return $success;
    }

    /**
     * Syncs analysed chunks that have no qdrant_id yet.
     *
     * @param int $limit maximum number of chunks fetched for this run
     *
     * @return array{synced: int, failed: int, errors: array<string>}
     */
    public function syncAllPending(int $limit = Constants::DEFAULT_LIMIT): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getUnsyncedChunks($limit));
    }

    /**
     * Syncs every analysed chunk again, including those already in Qdrant.
     *
     * @return array{synced: int, failed: int, errors: array<string>}
     */
    public function syncAll(): array
    {
        $this->ensureCollection();

        return $this->syncChunks($this->getAllAnalyzedChunks());
    }

    /**
     * Gets collection statistics.
     *
     * @return array{points_count: int, status: string}|null
     */
    public function getStats(): ?array
    {
        return $this->qdrant->getCollectionStats(self::COLLECTION);
    }

    /**
     * Syncs the given chunk rows one by one and tallies the outcome.
     *
     * A chunk counts as failed when syncChunk() returns false or throws a
     * RuntimeException; in both cases a message is appended to 'errors' so the
     * caller can always tell which chunks failed. Other throwables propagate.
     *
     * @param array<array<string, mixed>> $chunks rows containing at least an 'id' column
     *
     * @return array{synced: int, failed: int, errors: array<string>}
     */
    private function syncChunks(array $chunks): array
    {
        $results = ['synced' => 0, 'failed' => 0, 'errors' => []];

        foreach ($chunks as $chunk) {
            $chunkId = (int) $chunk['id'];

            try {
                if (!$this->syncChunk($chunkId)) {
                    $results['failed']++;
                    $results['errors'][] = "Chunk #{$chunkId}: Sync failed";

                    continue;
                }

                $results['synced']++;

                if ($results['synced'] % self::BATCH_SIZE === 0) {
                    echo "Synced {$results['synced']} chunks...\n";
                }
            } catch (\RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunkId}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Builds the text that is sent to the embedding model.
     *
     * The text consists of the document title, the heading path, the taxonomy
     * path, the keywords and the (sanitised, truncated) chunk content, joined
     * by blank lines. The whole text is truncated a second time so the final
     * length stays bounded regardless of how long the metadata parts are.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     */
    private function prepareTextForEmbedding(array $chunk, array $doc): string
    {
        $parts = ['Dokument: ' . ($doc['title'] ?? '')];

        $headingPath = $this->decodeJsonArray($chunk['heading_path'] ?? null);
        if (!empty($headingPath)) {
            $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);
        }

        $taxonomy = $this->decodeJsonArray($chunk['taxonomy_path'] ?? null);
        if (!empty($taxonomy)) {
            $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);
        }

        $keywords = $this->decodeJsonArray($chunk['keywords'] ?? null);
        if (!empty($keywords)) {
            $parts[] = 'Keywords: ' . implode(', ', $keywords);
        }

        $parts[] = 'Inhalt: ' . $this->truncate($this->cleanContent($chunk), self::MAX_CONTENT_CHARS);

        return $this->truncate(implode("\n\n", $parts), self::MAX_EMBEDDING_TEXT_CHARS);
    }

    /**
     * Returns the sanitised chunk content, preferring 'content_clean' over 'content'.
     *
     * The value is cast to string so that a non-string column value cannot
     * raise a TypeError under strict_types.
     *
     * @param array<string, mixed> $chunk
     */
    private function cleanContent(array $chunk): string
    {
        return $this->sanitizeForEmbedding((string) ($chunk['content_clean'] ?? $chunk['content'] ?? ''));
    }

    /**
     * Shortens $text to at most $maxChars characters and appends an ellipsis
     * when something was cut off. Lengths are counted in characters, not bytes.
     */
    private function truncate(string $text, int $maxChars): string
    {
        if (mb_strlen($text) <= $maxChars) {
            return $text;
        }

        return mb_substr($text, 0, $maxChars) . self::ELLIPSIS;
    }

    /**
     * Sanitizes text for embedding by removing problematic characters.
     *
     * Steps: scrub invalid UTF-8, replace characters in U+2500..U+25FF with a
     * space, strip ASCII control characters except tab, LF and CR, collapse
     * runs of spaces/tabs, collapse three or more newlines into two, trim.
     */
    private function sanitizeForEmbedding(string $text): string
    {
        // Must run first: preg_replace() with the /u modifier returns null on
        // invalid UTF-8, which would silently skip the replacements below.
        $text = mb_scrub($text, 'UTF-8');

        // U+2500..U+257F, U+2580..U+259F and U+25A0..U+25FF are contiguous,
        // so a single range covers all three.
        $text = preg_replace('/[\x{2500}-\x{25FF}]/u', ' ', $text) ?? $text;
        $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text) ?? $text;
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Builds the payload stored alongside the vector in Qdrant.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     * @return array<string, mixed>
     */
    private function buildPayload(array $chunk, array $doc): array
    {
        return [
            'chunk_id' => (int) $chunk['id'],
            'doc_id' => (int) $chunk['dokumentation_id'],
            'chunk_index' => (int) $chunk['chunk_index'],
            'path' => $doc['path'] ?? '',
            'title' => $doc['title'] ?? '',
            'content_preview' => $this->truncate($this->cleanContent($chunk), self::MAX_PREVIEW_CHARS),
            'heading_path' => $this->decodeJsonArray($chunk['heading_path'] ?? null),
            'taxonomy_category' => $chunk['taxonomy_category'] ?? null,
            'taxonomy' => $this->decodeJsonArray($chunk['taxonomy_path'] ?? null),
            'entities' => $this->decodeJsonArray($chunk['entities'] ?? null),
            'keywords' => $this->decodeJsonArray($chunk['keywords'] ?? null),
            'token_count' => (int) ($chunk['token_count'] ?? 0),
        ];
    }

    /**
     * Stores the Qdrant point id on the chunk row.
     */
    private function updateQdrantId(int $chunkId, string $qdrantId): void
    {
        $stmt = $this->pdo->prepare('UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id');
        $stmt->execute(['id' => $chunkId, 'qid' => $qdrantId]);
    }

    /**
     * Gets a chunk by ID.
     *
     * @return array<string, mixed>|null null when no row with that id exists
     */
    private function getChunk(int $id): ?array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? null : $row;
    }

    /**
     * Gets a document by ID.
     *
     * @return array<string, mixed> empty array when no row with that id exists
     */
    private function getDocument(int $id): array
    {
        $stmt = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');
        $stmt->execute(['id' => $id]);
        $row = $stmt->fetch(PDO::FETCH_ASSOC);

        return $row === false ? [] : $row;
    }

    /**
     * Gets unsynced chunks (analysis completed, qdrant_id still NULL).
     *
     * @return array<array<string, mixed>>
     */
    private function getUnsyncedChunks(int $limit): array
    {
        $stmt = $this->pdo->prepare("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed' AND qdrant_id IS NULL
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ");
        // Bound explicitly as integer; a string-bound LIMIT is rejected by
        // some drivers when prepared statements are emulated.
        $stmt->bindValue('limit', $limit, PDO::PARAM_INT);
        $stmt->execute();

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Gets all chunks whose analysis is completed.
     *
     * @return array<array<string, mixed>>
     *
     * @throws \RuntimeException when the query fails without PDO raising an exception itself
     */
    private function getAllAnalyzedChunks(): array
    {
        $stmt = $this->pdo->query("
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed'
            ORDER BY dokumentation_id, chunk_index
        ");

        // PDO::query() returns false instead of throwing when the connection
        // is not in exception error mode.
        if ($stmt === false) {
            throw new \RuntimeException('Failed to query analyzed chunks');
        }

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }
}

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
1999 17 modified 9.2 KB 2025-12-28 14:01
1856 16 modified 9.2 KB 2025-12-27 23:46
1854 15 modified 9.2 KB 2025-12-27 23:46
1849 14 modified 9.2 KB 2025-12-27 23:45
1281 13 modified 16.3 KB 2025-12-25 13:01
852 12 modified 16.4 KB 2025-12-23 08:46
851 11 modified 16.5 KB 2025-12-23 08:46
785 10 modified 16.6 KB 2025-12-23 08:05
406 9 modified 16.4 KB 2025-12-22 08:54
364 8 modified 16.4 KB 2025-12-22 08:23
363 7 modified 16.3 KB 2025-12-22 08:23
325 6 modified 16.3 KB 2025-12-22 08:08
324 5 modified 16.3 KB 2025-12-22 08:08
323 4 modified 16.3 KB 2025-12-22 08:08
322 3 modified 16.2 KB 2025-12-22 08:08
37 2 modified 17.0 KB 2025-12-20 17:23
28 1 modified 17.0 KB 2025-12-20 17:18

← Zurück zur Übersicht