Backup #363

ID: 363
Dateipfad: /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkSyncService.php
Version: 7
Typ: modified
Größe: 16.3 KB
Hash: 945a5092b34b6fff1e582d775722804097b19cc4901f758e233d4d318c16a00e
Datum: 2025-12-22 08:23:53
Geändert von: claude-code-hook
Grund: Claude Code Pre-Hook Backup vor Edit-Operation
Datei existiert: Ja

Dateiinhalt

<?php

declare(strict_types=1);

namespace Infrastructure\Docs;

use Infrastructure\AI\OllamaService;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
use RuntimeException;

/**
 * Service for synchronizing documentation chunks to Qdrant vector database.
 *
 * Syncs analyzed chunks with their embeddings and metadata to enable
 * semantic search with structured filtering.
 */
final class ChunkSyncService
{
    use JsonDecodeTrait;

    private const string COLLECTION = 'dokumentation_chunks';
    private const string QDRANT_HOST = 'http://localhost:6333';
    private const int VECTOR_SIZE = 1024;
    private const int TIMEOUT = 60;
    private const int BATCH_SIZE = 10;

    private PDO $pdo;
    private OllamaService $ollama;

    /**
     * Wires up the service's collaborators.
     *
     * Instantiates an OllamaService directly and obtains the PDO handle from
     * createConnection(), which is not shown in this excerpt. Nothing is
     * injected from outside.
     */
    public function __construct()
    {
        $this->ollama = new OllamaService();
        $this->pdo = $this->createConnection();
    }

    /**
     * Ensures the Qdrant collection exists, creating it when the lookup fails.
     *
     * A GET on the collection URL is tried first; a response carrying a
     * `result` key counts as "already there". A RuntimeException from that
     * probe, or a response without `result`, falls through to a PUT that
     * creates the collection with VECTOR_SIZE dimensions and cosine distance.
     *
     * @return bool Always true on normal return; failure is signalled by exception.
     *
     * @throws RuntimeException When the create request fails. The underlying
     *                          exception is attached as `previous` so the
     *                          original stack trace is not lost.
     */
    public function ensureCollection(): bool
    {
        $url = sprintf('%s/collections/%s', self::QDRANT_HOST, self::COLLECTION);

        try {
            $response = $this->makeRequest($url, [], 'GET');
            if (isset($response['result'])) {
                return true; // Collection exists
            }
        } catch (RuntimeException) {
            // The probe failed; treat it as "collection missing" and try to create it.
        }

        $payload = [
            'vectors' => [
                'size' => self::VECTOR_SIZE,
                'distance' => 'Cosine',
            ],
        ];

        try {
            $this->makeRequest($url, $payload, 'PUT');
        } catch (RuntimeException $e) {
            // Keep the message format callers may already log, but chain the cause.
            throw new RuntimeException('Failed to create collection: ' . $e->getMessage(), previous: $e);
        }

        return true;
    }

    /**
     * Pushes one chunk (embedding plus metadata payload) into Qdrant.
     *
     * Chunks that cannot be found, or whose `analysis_status` is not
     * `completed`, are skipped. When the chunk has no `qdrant_id` yet, a
     * fresh UUID is generated and written back to the database after a
     * successful upsert.
     *
     * @return bool True when the point was upserted; false when the chunk was
     *              skipped or the upsert reported failure.
     */
    public function syncChunk(int $chunkId): bool
    {
        $chunk = $this->getChunk($chunkId);

        // Missing rows and unfinished analyses are both "nothing to sync".
        if ($chunk === null || $chunk['analysis_status'] !== 'completed') {
            return false;
        }

        $doc = $this->getDocument((int) $chunk['dokumentation_id']);

        // Embed the contextualised text, then assemble the metadata payload.
        $vector = $this->ollama->getEmbedding($this->prepareTextForEmbedding($chunk, $doc));
        $metadata = $this->buildPayload($chunk, $doc);

        // Reuse the stored point id, or mint one for a first-time sync.
        $pointId = $chunk['qdrant_id'] ?? $this->generateUuid();

        if (!$this->upsertPoint($pointId, $vector, $metadata)) {
            return false;
        }

        // Persist the newly minted id only; existing ids are already stored.
        if ($chunk['qdrant_id'] === null) {
            $this->updateQdrantId($chunkId, $pointId);
        }

        return true;
    }

    /**
     * Syncs up to $limit chunks returned by getUnsyncedChunks().
     *
     * The collection is ensured first. Every chunk is attempted; a false
     * result or a RuntimeException is counted as a failure and recorded in
     * `errors`. A progress line is echoed every BATCH_SIZE successful syncs.
     *
     * @return array{synced: int, failed: int, errors: array<string>}
     */
    public function syncAllPending(int $limit = 100): array
    {
        $this->ensureCollection();

        $synced = 0;
        $failed = 0;
        $errors = [];

        foreach ($this->getUnsyncedChunks($limit) as $row) {
            try {
                $ok = $this->syncChunk((int) $row['id']);
            } catch (RuntimeException $e) {
                $failed++;
                $errors[] = "Chunk #{$row['id']}: " . $e->getMessage();

                continue;
            }

            if (!$ok) {
                $failed++;
                $errors[] = "Chunk #{$row['id']}: Sync failed";

                continue;
            }

            $synced++;

            // Progress output once per full batch of successes.
            if ($synced % self::BATCH_SIZE === 0) {
                echo "Synced {$synced} chunks...\n";
            }
        }

        return ['synced' => $synced, 'failed' => $failed, 'errors' => $errors];
    }

    /**
     * Re-syncs every chunk returned by getAllAnalyzedChunks().
     *
     * The collection is ensured first. Every chunk is attempted; a false
     * result or a RuntimeException is counted as a failure. Both kinds of
     * failure now leave an entry in `errors`, matching syncAllPending(), so
     * `failed` and `count(errors)` stay in step and the caller can see which
     * chunks were affected. A progress line is echoed every BATCH_SIZE
     * successful syncs.
     *
     * @return array{synced: int, failed: int, errors: array<string>}
     */
    public function syncAll(): array
    {
        $this->ensureCollection();

        $results = ['synced' => 0, 'failed' => 0, 'errors' => []];

        foreach ($this->getAllAnalyzedChunks() as $chunk) {
            try {
                if ($this->syncChunk((int) $chunk['id'])) {
                    $results['synced']++;

                    if ($results['synced'] % self::BATCH_SIZE === 0) {
                        echo "Synced {$results['synced']} chunks...\n";
                    }
                } else {
                    $results['failed']++;
                    // Previously this failure was counted but not reported.
                    $results['errors'][] = "Chunk #{$chunk['id']}: Sync failed";
                }
            } catch (RuntimeException $e) {
                $results['failed']++;
                $results['errors'][] = "Chunk #{$chunk['id']}: " . $e->getMessage();
            }
        }

        return $results;
    }

    /**
     * Runs a vector similarity search against the chunk collection.
     *
     * The query is embedded via the Ollama service and posted to the
     * collection's `points/search` endpoint with payloads enabled. Each hit's
     * payload is flattened into a fixed-shape row; missing payload fields fall
     * back to 0, '' or [].
     *
     * @param array<string, mixed>|null $filter Optional filter, passed through unchanged as the request's `filter`
     * @return array<array{id: int, doc_id: int, path: string, title: string, content: string, score: float, taxonomy: array<string>, entities: array<mixed>}>
     */
    public function search(string $query, int $limit = 5, ?array $filter = null): array
    {
        $queryVector = $this->ollama->getEmbedding($query);

        $request = [
            'vector' => array_values($queryVector),
            'limit' => $limit,
            'with_payload' => true,
        ];
        if ($filter !== null) {
            $request['filter'] = $filter;
        }

        $response = $this->makeRequest(
            sprintf('%s/collections/%s/points/search', self::QDRANT_HOST, self::COLLECTION),
            $request,
            'POST',
        );

        // Anything other than an array under `result` means "no hits".
        $hits = $response['result'] ?? null;
        if (!is_array($hits)) {
            return [];
        }

        $toRow = static function (array $hit): array {
            $meta = $hit['payload'] ?? [];
            $taxonomy = $meta['taxonomy'] ?? null;
            $entities = $meta['entities'] ?? null;

            return [
                'id' => (int) ($meta['chunk_id'] ?? 0),
                'doc_id' => (int) ($meta['doc_id'] ?? 0),
                'path' => (string) ($meta['path'] ?? ''),
                'title' => (string) ($meta['title'] ?? ''),
                'content' => (string) ($meta['content_preview'] ?? ''),
                'score' => (float) ($hit['score'] ?? 0),
                'taxonomy' => is_array($taxonomy) ? $taxonomy : [],
                'entities' => is_array($entities) ? $entities : [],
            ];
        };

        return array_map($toRow, $hits);
    }

    /**
     * Semantic search restricted to one taxonomy category.
     *
     * Builds a `must` filter matching the `taxonomy_category` payload field
     * against $category and delegates to search().
     *
     * @return array<array<string, mixed>>
     */
    public function searchByTaxonomy(string $query, string $category, int $limit = 5): array
    {
        $categoryCondition = [
            'key' => 'taxonomy_category',
            'match' => ['value' => $category],
        ];

        return $this->search($query, $limit, ['must' => [$categoryCondition]]);
    }

    /**
     * Reads point count and status of the collection.
     *
     * Returns null when the request throws a RuntimeException or the response
     * has no `result` entry. Missing fields default to 0 and 'unknown'.
     *
     * @return array{points_count: int, status: string}|null
     */
    public function getStats(): ?array
    {
        try {
            $response = $this->makeRequest(
                sprintf('%s/collections/%s', self::QDRANT_HOST, self::COLLECTION),
                [],
                'GET',
            );
        } catch (RuntimeException) {
            return null;
        }

        if (!isset($response['result'])) {
            return null;
        }

        $info = $response['result'];

        return [
            'points_count' => (int) ($info['points_count'] ?? 0),
            'status' => (string) ($info['status'] ?? 'unknown'),
        ];
    }

    /**
     * Assembles the text that is sent to the embedding model for a chunk.
     *
     * Sections are joined by blank lines in this order: document title,
     * heading path, taxonomy path, keywords (each only when non-empty), then
     * the sanitized content cut to 1000 characters. The whole text is finally
     * cut to 1800 characters; every cut appends '...'.
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     */
    private function prepareTextForEmbedding(array $chunk, array $doc): string
    {
        // Cut to $max characters and mark the cut with an ellipsis.
        $clip = static fn (string $value, int $max): string => mb_strlen($value) > $max
            ? mb_substr($value, 0, $max) . '...'
            : $value;

        $sections = ['Dokument: ' . ($doc['title'] ?? '')];

        // label, glue, chunk column — emitted only when the decoded list is non-empty
        $optional = [
            ['Abschnitt: ', ' > ', 'heading_path'],
            ['Kategorie: ', ' > ', 'taxonomy_path'],
            ['Keywords: ', ', ', 'keywords'],
        ];
        foreach ($optional as [$label, $glue, $column]) {
            $values = $this->decodeJsonArray($chunk[$column] ?? null);
            if (!empty($values)) {
                $sections[] = $label . implode($glue, $values);
            }
        }

        $body = $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');
        $sections[] = 'Inhalt: ' . $clip($body, 1000);

        // Overall cap so the combined text stays within the embedding model's context.
        return $clip(implode("\n\n", $sections), 1800);
    }

    /**
     * Cleans text before it is embedded or stored as a preview.
     *
     * Steps, in order:
     *  1. Repair invalid UTF-8 byte sequences.
     *  2. Replace box-drawing, block-element and geometric-shape characters
     *     (U+2500–U+25FF) with a space.
     *  3. Strip control characters except tab, newline and carriage return.
     *  4. Collapse runs of spaces/tabs to one space and 3+ newlines to two.
     *  5. Trim.
     *
     * The UTF-8 repair must come first: a `/u` pattern makes preg_replace()
     * return null on malformed input, and the `?? $text` fallback would then
     * silently skip steps 2 and 3.
     *
     * @param string $text Raw text, possibly containing invalid UTF-8
     * @return string Sanitized, trimmed text
     */
    private function sanitizeForEmbedding(string $text): string
    {
        // Repair encoding up front so the Unicode-aware patterns below can match.
        $repaired = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        if (is_string($repaired)) {
            $text = $repaired;
        }

        // Box Drawing, Block Elements and Geometric Shapes are one contiguous range.
        $text = preg_replace('/[\x{2500}-\x{25FF}]/u', ' ', $text) ?? $text;

        // Remove control characters except newlines and tabs
        $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text) ?? $text;

        // Normalize whitespace
        $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
        $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

        return trim($text);
    }

    /**
     * Assembles the metadata stored alongside the vector in Qdrant.
     *
     * `content_preview` is the sanitized content, cut to 300 characters with
     * a trailing '...' when longer. JSON-encoded chunk columns are decoded
     * through decodeJsonArray().
     *
     * @param array<string, mixed> $chunk
     * @param array<string, mixed> $doc
     * @return array<string, mixed>
     */
    private function buildPayload(array $chunk, array $doc): array
    {
        $text = $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');

        $preview = $text;
        if (mb_strlen($text) > 300) {
            $preview = mb_substr($text, 0, 300) . '...';
        }

        // Decode one JSON column of the chunk row; an absent column is passed as null.
        $decode = fn (string $column) => $this->decodeJsonArray($chunk[$column] ?? null);

        return [
            'chunk_id' => (int) $chunk['id'],
            'doc_id' => (int) $chunk['dokumentation_id'],
            'chunk_index' => (int) $chunk['chunk_index'],
            'path' => $doc['path'] ?? '',
            'title' => $doc['title'] ?? '',
            'content_preview' => $preview,
            'heading_path' => $decode('heading_path'),
            'taxonomy_category' => $chunk['taxonomy_category'] ?? null,
            'taxonomy' => $decode('taxonomy_path'),
            'entities' => $decode('entities'),
            'keywords' => $decode('keywords'),
            'token_count' => (int) ($chunk['token_count'] ?? 0),
        ];
    }

    /**
     * Sends a single point (id, vector, payload) to the collection via PUT.
     *
     * A RuntimeException from the request is swallowed and reported as false;
     * the exception message is not surfaced to the caller.
     *
     * @param array<int, float> $vector
     * @param array<string, mixed> $payload
     * @return bool True when the request completed without throwing
     */
    private function upsertPoint(string $id, array $vector, array $payload): bool
    {
        $point = [
            'id' => $id,
            // Re-index so the vector is sent as a list.
            'vector' => array_values($vector),
            'payload' => $payload,
        ];

        try {
            $this->makeRequest(
                sprintf('%s/collections/%s/points', self::QDRANT_HOST, self::COLLECTION),
                ['points' => [$point]],
                'PUT',
            );
        } catch (RuntimeException) {
            return false;
        }

        return true;
    }

    /**
     * Stores the Qdrant point id on the chunk row with the given id.
     */
    private function updateQdrantId(int $chunkId, string $qdrantId): void
    {
        $update = $this->pdo->prepare('UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id');

        $bindings = [
            'id' => $chunkId,
            'qid' => $qdrantId,
        ];
        $update->execute($bindings);
    }

    /**
     * Loads one row from `dokumentation_chunks` by primary key.
     *
     * @return array<string, mixed>|null The row as an associative array, or null when no row matches
     */
    private function getChunk(int $id): ?array
    {
        $query = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
        $query->execute(['id' => $id]);

        $row = $query->fetch(PDO::FETCH_ASSOC);
        if ($row === false) {
            return null;
        }

        return $row;
    }

    /**
     * Loads one row from `dokumentation` by primary key.
     *
     * @return array<string, mixed> The row as an associative array, or an empty array when no row matches
     */
    private function getDocument(int $id): array
    {
        $query = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');
        $query->execute(['id' => $id]);

        $row = $query->fetch(PDO::FETCH_ASSOC);
        if ($row === false) {
            return [];
        }

        return $row;
    }

    /**
     * Fetches chunks whose analysis is completed and that have no `qdrant_id` yet.
     *
     * Rows are ordered by document id and chunk index; at most $limit are returned.
     *
     * @return array<array<string, mixed>>
     */
    private function getUnsyncedChunks(int $limit): array
    {
        $sql = "
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed' AND qdrant_id IS NULL
            ORDER BY dokumentation_id, chunk_index
            LIMIT :limit
        ";

        $query = $this->pdo->prepare($sql);
        // Bound explicitly as an integer rather than passed through execute().
        $query->bindValue('limit', $limit, PDO::PARAM_INT);
        $query->execute();

        return $query->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Fetches every chunk whose analysis is completed, regardless of sync state.
     *
     * Rows are ordered by document id and chunk index.
     *
     * @return array<array<string, mixed>>
     */
    private function getAllAnalyzedChunks(): array
    {
        $sql = "
            SELECT * FROM dokumentation_chunks
            WHERE analysis_status = 'completed'
            ORDER BY dokumentation_id, chunk_index
        ";

        $rows = $this->pdo->query($sql);

        return $rows->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * Produces a random UUID (version 4, RFC 4122 variant) in 8-4-4-4-12 hex form.
     */
    private function generateUuid(): string
    {
        $bytes = random_bytes(16);

        // High nibble of byte 6 carries the version (4).
        $bytes[6] = chr((ord($bytes[6]) & 0x0f) | 0x40);
        // Top two bits of byte 8 carry the variant (binary 10).
        $bytes[8] = chr((ord($bytes[8]) & 0x3f) | 0x80);

        $hex = bin2hex($bytes);

        return implode('-', [
            substr($hex, 0, 8),
            substr($hex, 8, 4),
            substr($hex, 12, 4),
            substr($hex, 16, 4),
            substr($hex, 20, 12),
        ]);
    }

    /**
     * Makes an HTTP request to Qdrant.
     *
     * @param array<string, mixed> $payload
     * @return array<string, mixed>
     */
    private function makeRequest(string $url, array $payload, string $method): array
    {
        $ch = curl_init($url);

        if ($ch === false) {
            throw new RuntimeException('Failed to initialize cURL');
        }

        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
        ];

... (44 weitere Zeilen)

Vollständig herunterladen

Aktionen

Herunterladen

Andere Versionen dieser Datei

ID Version Typ Größe Datum
1999 17 modified 9.2 KB 2025-12-28 14:01
1856 16 modified 9.2 KB 2025-12-27 23:46
1854 15 modified 9.2 KB 2025-12-27 23:46
1849 14 modified 9.2 KB 2025-12-27 23:45
1281 13 modified 16.3 KB 2025-12-25 13:01
852 12 modified 16.4 KB 2025-12-23 08:46
851 11 modified 16.5 KB 2025-12-23 08:46
785 10 modified 16.6 KB 2025-12-23 08:05
406 9 modified 16.4 KB 2025-12-22 08:54
364 8 modified 16.4 KB 2025-12-22 08:23
363 7 modified 16.3 KB 2025-12-22 08:23
325 6 modified 16.3 KB 2025-12-22 08:08
324 5 modified 16.3 KB 2025-12-22 08:08
323 4 modified 16.3 KB 2025-12-22 08:08
322 3 modified 16.2 KB 2025-12-22 08:08
37 2 modified 17.0 KB 2025-12-20 17:23
28 1 modified 17.0 KB 2025-12-20 17:18

← Zurück zur Übersicht