Backup #323
| ID | 323 |
| Dateipfad | /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkSyncService.php |
| Version | 4 |
| Typ | modified |
| Größe | 16.3 KB |
| Hash | 1f5c6bb67843f3259467bac91217c6c1a1d533243b745db18beb25f1bd531110 |
| Datum | 2025-12-22 08:08:33 |
| Geändert von | claude-code-hook |
| Grund | Claude Code Pre-Hook Backup vor Edit-Operation |
| Datei existiert | Ja |
Dateiinhalt
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
use Infrastructure\AI\OllamaService;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
use RuntimeException;
/**
* Service for synchronizing documentation chunks to Qdrant vector database.
*
* Syncs analyzed chunks with their embeddings and metadata to enable
* semantic search with structured filtering.
*/
final class ChunkSyncService
{
/** Name of the Qdrant collection that every request in this class targets. */
private const string COLLECTION = 'dokumentation_chunks';
/** Base URL of the Qdrant HTTP API. */
private const string QDRANT_HOST = 'http://localhost:6333';
/** Vector size sent as `vectors.size` when the collection is created. */
private const int VECTOR_SIZE = 1024;
/** Value passed to CURLOPT_TIMEOUT (seconds) for Qdrant requests. */
private const int TIMEOUT = 60;
/** Number of successfully synced chunks between two progress lines printed by the sync loops. */
private const int BATCH_SIZE = 10;
/** Database handle used for the dokumentation and dokumentation_chunks queries. */
private PDO $pdo;
/** Embedding client; getEmbedding() is called for chunk texts and search queries. */
private OllamaService $ollama;
/**
 * Creates the service with its own embedding client and database connection.
 *
 * Both collaborators are instantiated here rather than injected. The PDO
 * handle is obtained from createConnection(), which is defined elsewhere in
 * this class.
 */
public function __construct()
{
$this->ollama = new OllamaService();
$this->pdo = $this->createConnection();
}
/**
 * Ensures the Qdrant collection exists with proper configuration.
 *
 * The collection is looked up first; the create request is only sent when
 * the lookup fails or its response carries no `result` entry.
 *
 * @return bool Always true; failure is reported through the exception below.
 *
 * @throws RuntimeException When the create request fails. The original
 *                          exception is attached as the previous exception
 *                          so the root cause stays visible in traces.
 */
public function ensureCollection(): bool
{
    $url = sprintf('%s/collections/%s', self::QDRANT_HOST, self::COLLECTION);

    try {
        $response = $this->makeRequest($url, [], 'GET');
        if (isset($response['result'])) {
            return true;
        }
    } catch (RuntimeException) {
        // Any lookup failure is treated as "collection missing"; if Qdrant is
        // actually unreachable, the create request below reports it.
    }

    $payload = [
        'vectors' => [
            'size' => self::VECTOR_SIZE,
            'distance' => 'Cosine',
        ],
    ];

    try {
        $this->makeRequest($url, $payload, 'PUT');
    } catch (RuntimeException $e) {
        // Chain the cause instead of flattening it into the message only.
        throw new RuntimeException('Failed to create collection: ' . $e->getMessage(), 0, $e);
    }

    return true;
}
/**
 * Pushes one chunk, identified by its database ID, into Qdrant.
 *
 * The chunk is embedded together with context from its parent document and
 * upserted under its stored Qdrant ID. When the chunk has no Qdrant ID yet,
 * a new UUID is generated and written back after a successful upsert.
 *
 * @return bool False when the chunk is unknown, its analysis status is not
 *              'completed', or the upsert fails; true otherwise.
 */
public function syncChunk(int $chunkId): bool
{
    $row = $this->getChunk($chunkId);
    if ($row === null || $row['analysis_status'] !== 'completed') {
        return false;
    }

    $document = $this->getDocument((int) $row['dokumentation_id']);
    $vector = $this->ollama->getEmbedding($this->prepareTextForEmbedding($row, $document));
    $metadata = $this->buildPayload($row, $document);

    // Reuse the stored point ID so a re-sync overwrites the existing point.
    $pointId = $row['qdrant_id'] ?? $this->generateUuid();
    if (!$this->upsertPoint($pointId, $vector, $metadata)) {
        return false;
    }

    if ($row['qdrant_id'] === null) {
        $this->updateQdrantId($chunkId, $pointId);
    }

    return true;
}
/**
 * Syncs completed chunks that do not have a Qdrant ID yet.
 *
 * Makes sure the collection exists, then processes at most $limit chunks.
 * A progress line is printed after every BATCH_SIZE successful syncs.
 *
 * @return array{synced: int, failed: int, errors: array<string>}
 */
public function syncAllPending(int $limit = 100): array
{
    $this->ensureCollection();

    $synced = 0;
    $failed = 0;
    $errors = [];

    foreach ($this->getUnsyncedChunks($limit) as $row) {
        $id = $row['id'];

        try {
            $ok = $this->syncChunk((int) $id);
        } catch (RuntimeException $e) {
            $failed++;
            $errors[] = "Chunk #{$id}: " . $e->getMessage();
            continue;
        }

        if (!$ok) {
            $failed++;
            $errors[] = "Chunk #{$id}: Sync failed";
            continue;
        }

        $synced++;
        if ($synced % self::BATCH_SIZE === 0) {
            echo "Synced {$synced} chunks...\n";
        }
    }

    return ['synced' => $synced, 'failed' => $failed, 'errors' => $errors];
}
/**
 * Syncs all chunks (re-sync).
 *
 * Processes every chunk whose analysis is completed, including chunks that
 * already have a Qdrant ID. A progress line is printed after every
 * BATCH_SIZE successful syncs.
 *
 * Every failed chunk contributes one message to `errors`, whether it failed
 * by returning false or by throwing, so `failed` and the number of messages
 * stay in step.
 *
 * @return array{synced: int, failed: int, errors: array<string>}
 */
public function syncAll(): array
{
    $this->ensureCollection();
    $results = ['synced' => 0, 'failed' => 0, 'errors' => []];

    foreach ($this->getAllAnalyzedChunks() as $chunk) {
        try {
            if ($this->syncChunk((int) $chunk['id'])) {
                $results['synced']++;
                if ($results['synced'] % self::BATCH_SIZE === 0) {
                    echo "Synced {$results['synced']} chunks...\n";
                }
            } else {
                $results['failed']++;
                // Previously counted without a message, which left callers
                // unable to tell which chunk failed.
                $results['errors'][] = "Chunk #{$chunk['id']}: Sync failed";
            }
        } catch (RuntimeException $e) {
            $results['failed']++;
            $results['errors'][] = "Chunk #{$chunk['id']}: " . $e->getMessage();
        }
    }

    return $results;
}
/**
 * Runs a semantic search over the synced chunks.
 *
 * The query is embedded, sent to the collection's search endpoint and each
 * hit is flattened into a plain result row built from the point's payload.
 *
 * @param array<string, mixed>|null $filter Optional filter for taxonomy/entities
 * @return array<array{id: int, doc_id: int, path: string, title: string, content: string, score: float, taxonomy: array<string>, entities: array<mixed>}>
 */
public function search(string $query, int $limit = 5, ?array $filter = null): array
{
    $queryVector = $this->ollama->getEmbedding($query);

    $body = [
        'vector' => array_values($queryVector),
        'limit' => $limit,
        'with_payload' => true,
    ];
    if ($filter !== null) {
        $body['filter'] = $filter;
    }

    $endpoint = sprintf('%s/collections/%s/points/search', self::QDRANT_HOST, self::COLLECTION);
    $response = $this->makeRequest($endpoint, $body, 'POST');

    $hits = $response['result'] ?? null;
    if (!is_array($hits)) {
        return [];
    }

    $toRow = static function (array $hit): array {
        $meta = $hit['payload'] ?? [];
        $taxonomy = $meta['taxonomy'] ?? null;
        $entities = $meta['entities'] ?? null;

        return [
            'id' => (int) ($meta['chunk_id'] ?? 0),
            'doc_id' => (int) ($meta['doc_id'] ?? 0),
            'path' => (string) ($meta['path'] ?? ''),
            'title' => (string) ($meta['title'] ?? ''),
            'content' => (string) ($meta['content_preview'] ?? ''),
            'score' => (float) ($hit['score'] ?? 0),
            'taxonomy' => is_array($taxonomy) ? $taxonomy : [],
            'entities' => is_array($entities) ? $entities : [],
        ];
    };

    return array_map($toRow, $hits);
}
/**
 * Semantic search restricted to one taxonomy category.
 *
 * Delegates to search() with a `must` filter that matches the payload
 * field `taxonomy_category` against $category.
 *
 * @return array<array<string, mixed>>
 */
public function searchByTaxonomy(string $query, string $category, int $limit = 5): array
{
    $categoryCondition = [
        'key' => 'taxonomy_category',
        'match' => ['value' => $category],
    ];

    return $this->search($query, $limit, ['must' => [$categoryCondition]]);
}
/**
 * Reads point count and status of the collection from Qdrant.
 *
 * @return array{points_count: int, status: string}|null Null when the
 *         request fails or the response has no `result` entry.
 */
public function getStats(): ?array
{
    $endpoint = sprintf('%s/collections/%s', self::QDRANT_HOST, self::COLLECTION);

    try {
        $response = $this->makeRequest($endpoint, [], 'GET');
    } catch (RuntimeException) {
        return null;
    }

    $info = $response['result'] ?? null;
    if ($info === null) {
        return null;
    }

    return [
        'points_count' => (int) ($info['points_count'] ?? 0),
        'status' => (string) ($info['status'] ?? 'unknown'),
    ];
}
/**
 * Prepares text for embedding.
 *
 * Builds a labelled text block from the document title, heading path,
 * taxonomy path, keywords and the sanitized chunk content. Content is cut
 * to 1000 characters and the assembled text to 1800 characters; a cut is
 * marked with a trailing '...'.
 *
 * JSON columns are only used when they decode to an array, and only their
 * scalar elements are joined. A column holding a JSON scalar or nested
 * structure therefore no longer breaks implode().
 *
 * @param array<string, mixed> $chunk
 * @param array<string, mixed> $doc
 */
private function prepareTextForEmbedding(array $chunk, array $doc): string
{
    // Decodes a JSON column into a flat list that is safe to implode.
    $decodeList = static function (mixed $json): array {
        if (!is_string($json) || $json === '') {
            return [];
        }
        $decoded = json_decode($json, true);
        if (!is_array($decoded)) {
            return [];
        }

        return array_values(array_filter($decoded, is_scalar(...)));
    };

    $parts = [];
    // Document context
    $parts[] = 'Dokument: ' . ($doc['title'] ?? '');

    // Heading path
    $headingPath = $decodeList($chunk['heading_path'] ?? null);
    if ($headingPath !== []) {
        $parts[] = 'Abschnitt: ' . implode(' > ', $headingPath);
    }

    // Taxonomy
    $taxonomy = $decodeList($chunk['taxonomy_path'] ?? null);
    if ($taxonomy !== []) {
        $parts[] = 'Kategorie: ' . implode(' > ', $taxonomy);
    }

    // Keywords
    $keywords = $decodeList($chunk['keywords'] ?? null);
    if ($keywords !== []) {
        $parts[] = 'Keywords: ' . implode(', ', $keywords);
    }

    // Main content - sanitize and limit
    $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';
    $content = $this->sanitizeForEmbedding($content);
    if (mb_strlen($content) > 1000) {
        $content = mb_substr($content, 0, 1000) . '...';
    }
    $parts[] = 'Inhalt: ' . $content;

    $text = implode("\n\n", $parts);

    // Final safety limit for embedding model context
    if (mb_strlen($text) > 1800) {
        $text = mb_substr($text, 0, 1800) . '...';
    }

    return $text;
}
/**
 * Sanitizes text for embedding by removing problematic characters.
 *
 * Steps, in order: repair invalid UTF-8, replace box-drawing / block /
 * geometric-shape characters (U+2500–U+25FF) with a space, drop control
 * characters except tab, newline and carriage return, collapse runs of
 * spaces/tabs, reduce three or more newlines to two, and trim.
 *
 * @param string $text Raw chunk text; may contain invalid byte sequences.
 * @return string Valid UTF-8 text with the characters above removed.
 */
private function sanitizeForEmbedding(string $text): string
{
    // Must run first: a single invalid byte makes every /u pattern below
    // fail (preg_replace returns null), which would skip the cleanup.
    $text = mb_scrub($text, 'UTF-8');

    // Box Drawing, Block Elements and Geometric Shapes are one contiguous range.
    $text = preg_replace('/[\x{2500}-\x{25FF}]/u', ' ', $text) ?? $text;

    // Remove control characters except newlines and tabs
    $text = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $text) ?? $text;

    // Normalize whitespace
    $text = preg_replace('/[ \t]+/', ' ', $text) ?? $text;
    $text = preg_replace('/\n{3,}/', "\n\n", $text) ?? $text;

    return trim($text);
}
/**
 * Assembles the metadata stored alongside a chunk's vector in Qdrant.
 *
 * Includes identifiers, document path/title, a sanitized content preview
 * of at most 300 characters (plus '...' when cut), and the decoded JSON
 * columns for headings, taxonomy, entities and keywords.
 *
 * @param array<string, mixed> $chunk
 * @param array<string, mixed> $doc
 * @return array<string, mixed>
 */
private function buildPayload(array $chunk, array $doc): array
{
    // Decodes a JSON column; missing or falsy results become an empty array.
    $decode = static fn (mixed $json): mixed => json_decode($json ?? '[]', true) ?: [];

    $text = $this->sanitizeForEmbedding($chunk['content_clean'] ?? $chunk['content'] ?? '');
    $preview = $text;
    if (mb_strlen($text) > 300) {
        $preview = mb_substr($text, 0, 300) . '...';
    }

    return [
        'chunk_id' => (int) $chunk['id'],
        'doc_id' => (int) $chunk['dokumentation_id'],
        'chunk_index' => (int) $chunk['chunk_index'],
        'path' => $doc['path'] ?? '',
        'title' => $doc['title'] ?? '',
        'content_preview' => $preview,
        'heading_path' => $decode($chunk['heading_path'] ?? null),
        'taxonomy_category' => $chunk['taxonomy_category'] ?? null,
        'taxonomy' => $decode($chunk['taxonomy_path'] ?? null),
        'entities' => $decode($chunk['entities'] ?? null),
        'keywords' => $decode($chunk['keywords'] ?? null),
        'token_count' => (int) ($chunk['token_count'] ?? 0),
    ];
}
/**
 * Writes a single point (ID, vector, payload) to the collection.
 *
 * @param array<int, float> $vector
 * @param array<string, mixed> $payload
 * @return bool True when the request went through, false when it raised a
 *              RuntimeException.
 */
private function upsertPoint(string $id, array $vector, array $payload): bool
{
    $point = [
        'id' => $id,
        // Re-index so the vector is sent as a plain list.
        'vector' => array_values($vector),
        'payload' => $payload,
    ];
    $endpoint = sprintf('%s/collections/%s/points', self::QDRANT_HOST, self::COLLECTION);

    try {
        $this->makeRequest($endpoint, ['points' => [$point]], 'PUT');
    } catch (RuntimeException) {
        return false;
    }

    return true;
}
/**
 * Stores the Qdrant point ID on the chunk's database row.
 */
private function updateQdrantId(int $chunkId, string $qdrantId): void
{
    $sql = 'UPDATE dokumentation_chunks SET qdrant_id = :qid WHERE id = :id';

    $update = $this->pdo->prepare($sql);
    $update->execute([
        'qid' => $qdrantId,
        'id' => $chunkId,
    ]);
}
/**
 * Loads one row from dokumentation_chunks.
 *
 * @return array<string, mixed>|null Null when no row has the given ID.
 */
private function getChunk(int $id): ?array
{
    $query = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
    $query->execute(['id' => $id]);

    $row = $query->fetch(PDO::FETCH_ASSOC);
    if ($row === false) {
        return null;
    }

    return $row;
}
/**
 * Loads one row from dokumentation.
 *
 * @return array<string, mixed> Empty array when no row has the given ID.
 */
private function getDocument(int $id): array
{
    $query = $this->pdo->prepare('SELECT * FROM dokumentation WHERE id = :id');
    $query->execute(['id' => $id]);

    $row = $query->fetch(PDO::FETCH_ASSOC);
    if ($row === false) {
        return [];
    }

    return $row;
}
/**
 * Fetches completed chunks that have no Qdrant ID yet, ordered by document
 * and chunk index.
 *
 * @return array<array<string, mixed>>
 */
private function getUnsyncedChunks(int $limit): array
{
    // Leading and trailing empty entries keep the surrounding newlines of the query text.
    $sql = implode("\n", [
        '',
        'SELECT * FROM dokumentation_chunks',
        "WHERE analysis_status = 'completed' AND qdrant_id IS NULL",
        'ORDER BY dokumentation_id, chunk_index',
        'LIMIT :limit',
        '',
    ]);

    $query = $this->pdo->prepare($sql);
    // Bound as integer so the LIMIT value is not sent as a quoted string.
    $query->bindValue('limit', $limit, PDO::PARAM_INT);
    $query->execute();

    return $query->fetchAll(PDO::FETCH_ASSOC);
}
/**
 * Fetches every chunk whose analysis is completed, ordered by document and
 * chunk index.
 *
 * @return array<array<string, mixed>>
 */
private function getAllAnalyzedChunks(): array
{
    // Leading and trailing empty entries keep the surrounding newlines of the query text.
    $sql = implode("\n", [
        '',
        'SELECT * FROM dokumentation_chunks',
        "WHERE analysis_status = 'completed'",
        'ORDER BY dokumentation_id, chunk_index',
        '',
    ]);

    $rows = $this->pdo->query($sql);

    return $rows->fetchAll(PDO::FETCH_ASSOC);
}
/**
 * Produces a random UUID (version 4) in canonical 8-4-4-4-12 hex form.
 */
private function generateUuid(): string
{
    $bytes = random_bytes(16);

    // Version nibble: 0100 (v4).
    $bytes[6] = chr((ord($bytes[6]) & 0x0f) | 0x40);
    // Variant bits: 10xx.
    $bytes[8] = chr((ord($bytes[8]) & 0x3f) | 0x80);

    $hex = bin2hex($bytes);

    return sprintf(
        '%s-%s-%s-%s-%s',
        substr($hex, 0, 8),
        substr($hex, 8, 4),
        substr($hex, 12, 4),
        substr($hex, 16, 4),
        substr($hex, 20, 12),
    );
}
/**
* Makes an HTTP request to Qdrant.
*
* @param array<string, mixed> $payload
* @return array<string, mixed>
*/
private function makeRequest(string $url, array $payload, string $method): array
{
$ch = curl_init($url);
if ($ch === false) {
throw new RuntimeException('Failed to initialize cURL');
}
$options = [
CURLOPT_RETURNTRANSFER => true,
CURLOPT_TIMEOUT => self::TIMEOUT,
CURLOPT_CONNECTTIMEOUT => 10,
CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
];
if ($method === 'GET') {
... (42 weitere Zeilen)
Vollständig herunterladen
Aktionen
Andere Versionen dieser Datei
| ID | Version | Typ | Größe | Datum |
| 1999 | 17 | modified | 9.2 KB | 2025-12-28 14:01 |
| 1856 | 16 | modified | 9.2 KB | 2025-12-27 23:46 |
| 1854 | 15 | modified | 9.2 KB | 2025-12-27 23:46 |
| 1849 | 14 | modified | 9.2 KB | 2025-12-27 23:45 |
| 1281 | 13 | modified | 16.3 KB | 2025-12-25 13:01 |
| 852 | 12 | modified | 16.4 KB | 2025-12-23 08:46 |
| 851 | 11 | modified | 16.5 KB | 2025-12-23 08:46 |
| 785 | 10 | modified | 16.6 KB | 2025-12-23 08:05 |
| 406 | 9 | modified | 16.4 KB | 2025-12-22 08:54 |
| 364 | 8 | modified | 16.4 KB | 2025-12-22 08:23 |
| 363 | 7 | modified | 16.3 KB | 2025-12-22 08:23 |
| 325 | 6 | modified | 16.3 KB | 2025-12-22 08:08 |
| 324 | 5 | modified | 16.3 KB | 2025-12-22 08:08 |
| 323 | 4 | modified | 16.3 KB | 2025-12-22 08:08 |
| 322 | 3 | modified | 16.2 KB | 2025-12-22 08:08 |
| 37 | 2 | modified | 17.0 KB | 2025-12-20 17:23 |
| 28 | 1 | modified | 17.0 KB | 2025-12-20 17:18 |
← Zurück zur Übersicht