Backup #397
| ID | 397 |
| Dateipfad | /var/www/dev.campus.systemische-tools.de/src/Infrastructure/Docs/ChunkAnalysisService.php |
| Version | 6 |
| Typ | modified |
| Größe | 13.0 KB |
| Hash | 92b2b511952d031620f8313f8c8d69d8a032f9faebad62c9eaee0fd4dedf6802 |
| Datum | 2025-12-22 08:49:35 |
| Geändert von | claude-code-hook |
| Grund | Claude Code Pre-Hook Backup vor Edit-Operation |
| Datei existiert | Ja |
Dateiinhalt
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
use Infrastructure\AI\OllamaService;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
use RuntimeException;
/**
* Service for analyzing documentation chunks using local LLMs.
*
* Performs three types of analysis on each chunk:
* - Taxonomy: Hierarchical classification (e.g., Server > Security > Firewall)
* - Ontology: Entity extraction (technologies, concepts, configurations)
* - Semantics: Keywords and glossary terms
*/
final class ChunkAnalysisService
{
use JsonDecodeTrait;
// Model name passed to the LLM for the combined analysis and stored as analysis_model.
private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';
// Not referenced anywhere in this class as shown. NOTE(review): confirm whether it is still needed.
private const string ONTOLOGY_MODEL = 'gemma3:4b-it-qat';
// Maximum number of attempts callLlmWithRetry() makes per prompt.
private const int MAX_RETRIES = 3;
// analyzeAllPending() echoes a progress line every BATCH_SIZE successfully analyzed chunks.
private const int BATCH_SIZE = 10;
// Database connection used for all chunk and document queries.
private PDO $pdo;
// Client used to send prompts to the LLM.
private OllamaService $ollama;
/**
 * Wires up the LLM client and the database connection.
 *
 * Both collaborators can be injected (for example test doubles or an
 * alternative connection). When an argument is omitted, the default
 * collaborator is created exactly as before.
 *
 * @param OllamaService|null $ollama LLM client; a new OllamaService is created when null.
 * @param PDO|null $pdo Database connection; createConnection() is used when null.
 */
public function __construct(?OllamaService $ollama = null, ?PDO $pdo = null)
{
    $this->ollama = $ollama ?? new OllamaService();
    $this->pdo = $pdo ?? $this->createConnection();
}
/**
 * Analyzes a single chunk and persists the result.
 *
 * The chunk is flagged 'processing' before the analysis starts. On success the
 * results are stored (which sets the status to 'completed'). On any failure the
 * chunk is flagged 'failed' together with the error message and the original
 * exception is rethrown unchanged.
 *
 * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
 *
 * @throws RuntimeException When the chunk does not exist or the analysis fails.
 * @throws \Throwable Any other error raised during the analysis.
 */
public function analyzeChunk(int $chunkId): array
{
    $chunk = $this->getChunk($chunkId);
    if ($chunk === null) {
        throw new RuntimeException("Chunk #{$chunkId} not found");
    }

    $this->updateStatus($chunkId, 'processing');

    try {
        $docContext = $this->getDocumentContext((int) $chunk['dokumentation_id']);
        $analysis = $this->performAnalysis($chunk, $docContext);
        $this->storeAnalysisResults($chunkId, $analysis);

        return $analysis;
    } catch (\Throwable $e) {
        // Catch every Throwable, not only RuntimeException: an Error such as a
        // TypeError would otherwise leave the chunk stuck in 'processing'.
        try {
            $this->updateStatus($chunkId, 'failed', $e->getMessage());
        } catch (\Throwable) {
            // Recording the failure is best effort; a broken status write must
            // not replace the original error that is rethrown below.
        }

        throw $e;
    }
}
/**
 * Runs analyzeChunk() over the pending chunks, at most $limit of them.
 *
 * A chunk that fails with a RuntimeException is counted and its message is
 * collected; the loop then continues with the next chunk. A progress line is
 * echoed each time the number of analyzed chunks reaches a multiple of BATCH_SIZE.
 *
 * @return array{analyzed: int, failed: int, errors: array<string>}
 */
public function analyzeAllPending(int $limit = 100): array
{
    $results = ['analyzed' => 0, 'failed' => 0, 'errors' => []];

    foreach ($this->getPendingChunks($limit) as $chunk) {
        try {
            $this->analyzeChunk((int) $chunk['id']);
        } catch (RuntimeException $failure) {
            $results['failed']++;
            $results['errors'][] = "Chunk #{$chunk['id']}: " . $failure->getMessage();

            continue;
        }

        $results['analyzed']++;

        $reachedFullBatch = $results['analyzed'] % self::BATCH_SIZE === 0;
        if ($reachedFullBatch) {
            echo "Analyzed {$results['analyzed']} chunks...\n";
        }
    }

    return $results;
}
/**
 * Sends one chunk to the LLM and returns the parsed analysis.
 *
 * The prompt context is assembled from the document title and path, the chunk's
 * heading path and its content ('content_clean' when set, otherwise 'content').
 * When the model yields no taxonomy, one is derived from the document path.
 *
 * @param array<string, mixed> $chunk Row of the chunk being analyzed.
 * @param array<string, mixed> $docContext Document data with the keys 'title' and 'path'.
 * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
 */
private function performAnalysis(array $chunk, array $docContext): array
{
    $body = $chunk['content_clean'] ?? $chunk['content'];
    $breadcrumb = implode(' > ', $this->decodeJsonArray($chunk['heading_path'] ?? null));

    // A single combined prompt covers taxonomy, entities and keywords.
    $prompt = $this->buildAnalysisPrompt(sprintf(
        "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
        $docContext['title'],
        $docContext['path'],
        $breadcrumb,
        $body,
    ));

    $analysis = $this->parseAnalysisResponse(
        $this->callLlmWithRetry($prompt, self::TAXONOMY_MODEL),
    );

    // No usable taxonomy from the model: fall back to the document path.
    if ($analysis['taxonomy'] === []) {
        $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
    }

    return $analysis;
}
/**
 * Builds the prompt sent to the LLM for the combined analysis.
 *
 * The prompt text is German. It embeds $context and instructs the model to
 * answer with a single JSON object containing the keys "taxonomy", "entities"
 * and "keywords".
 *
 * @param string $context Text describing the chunk; interpolated into the prompt.
 * @return string The complete prompt.
 */
private function buildAnalysisPrompt(string $context): string
{
// The heredoc body is the literal prompt; {$context} is its only interpolated value.
return <<<PROMPT
Analysiere den folgenden technischen Dokumentationsabschnitt und extrahiere strukturierte Informationen.
{$context}
Antworte NUR mit einem JSON-Objekt in diesem exakten Format (keine Erklärungen):
{
"taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
"entities": [
{"name": "Entitätsname", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}
],
"keywords": ["keyword1", "keyword2", "keyword3"]
}
Regeln:
- taxonomy: Hierarchische Klassifikation (3 Ebenen: Bereich > Modul > Thema)
- entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
- keywords: 3-5 relevante Suchbegriffe
- Antworte NUR mit dem JSON, keine anderen Texte
PROMPT;
}
/**
 * Calls the LLM, retrying on RuntimeException.
 *
 * Up to MAX_RETRIES attempts are made. Between attempts the method sleeps for
 * 0.5 s multiplied by the number of the failed attempt; there is no sleep after
 * the last attempt.
 *
 * @param string $prompt Prompt text passed to the model.
 * @param string $model Name of the model to use.
 * @return string The raw model response of the first successful attempt.
 *
 * @throws RuntimeException When every attempt failed; the last failure is
 *                          attached as the previous exception.
 */
private function callLlmWithRetry(string $prompt, string $model): string
{
    $lastError = null;

    for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
        try {
            return $this->ollama->generate($prompt, $model);
        } catch (RuntimeException $e) {
            $lastError = $e;
            if ($attempt < self::MAX_RETRIES) {
                usleep(500_000 * $attempt); // Progressive backoff
            }
        }
    }

    // Chain the last failure so its stack trace and type are not lost.
    throw new RuntimeException(
        'LLM call failed after ' . self::MAX_RETRIES . ' attempts: ' . ($lastError?->getMessage() ?? 'Unknown error'),
        previous: $lastError,
    );
}
/**
 * Turns the raw LLM response into the structured analysis array.
 *
 * The JSON payload is located in this order: the content of the first fenced
 * code block (optionally tagged "json"), otherwise the span from the first "{"
 * to the last "}", otherwise the response as a whole. When that payload does not
 * decode to an array, all three lists are returned empty.
 *
 * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
 */
private function parseAnalysisResponse(string $response): array
{
    $payload = match (true) {
        preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $fenced) === 1 => $fenced[1],
        preg_match('/\{[\s\S]*\}/', $response, $braced) === 1 => $braced[0],
        default => $response,
    };

    $data = json_decode($payload, true);

    return is_array($data)
        ? [
            'taxonomy' => $this->validateArray($data['taxonomy'] ?? [], 'string'),
            'entities' => $this->validateEntities($data['entities'] ?? []),
            'keywords' => $this->validateArray($data['keywords'] ?? [], 'string'),
        ]
        : [
            'taxonomy' => [],
            'entities' => [],
            'keywords' => [],
        ];
}
/**
 * Reduces a decoded JSON value to a clean list of non-empty strings.
 *
 * Non-string items and strings that are empty after trimming are dropped. The
 * remaining strings are returned trimmed, in their original order, as a
 * re-indexed list.
 *
 * @param mixed $arr Value to validate; anything that is not an array yields [].
 * @param string $type Expected element type. Only string validation is
 *                     implemented; the value is not consulted and is kept for
 *                     signature compatibility.
 * @return array<string>
 */
private function validateArray(mixed $arr, string $type): array
{
    if (!is_array($arr)) {
        return [];
    }

    $clean = [];
    foreach ($arr as $item) {
        if (!is_string($item)) {
            continue;
        }

        // Store the trimmed value: the emptiness check already trims, so
        // keeping the padded original would be inconsistent.
        $item = trim($item);
        if ($item !== '') {
            $clean[] = $item;
        }
    }

    return $clean;
}
/**
 * Normalizes the entity list returned by the LLM.
 *
 * An item is kept only when it is an array whose 'name' is a string that is
 * non-empty after trimming. The type is trimmed and upper-cased; a missing,
 * non-string or empty type becomes 'OTHER'.
 *
 * @param mixed $entities Value to validate; anything that is not an array yields [].
 * @return array<array{name: string, type: string}>
 */
private function validateEntities(mixed $entities): array
{
    if (!is_array($entities)) {
        return [];
    }

    $result = [];
    foreach ($entities as $entity) {
        if (!is_array($entity) || !is_string($entity['name'] ?? null)) {
            continue;
        }

        $name = trim($entity['name']);
        if ($name === '') {
            // A whitespace-only name carries no information; do not store it.
            continue;
        }

        $rawType = $entity['type'] ?? null;
        $type = is_string($rawType) ? strtoupper(trim($rawType)) : '';

        $result[] = [
            'name' => $name,
            'type' => $type !== '' ? $type : 'OTHER',
        ];
    }

    return $result;
}
/**
 * Derives a fallback taxonomy from a document path.
 *
 * The path is split on "/", empty segments are ignored, and each segment is
 * translated through a small mapping of known names (matched case-insensitively)
 * or otherwise capitalized with ucfirst(). At most the first three levels are
 * returned.
 *
 * @param string $path Document path, e.g. "/server/ssh".
 * @return array<string>
 */
private function deriveTaxonomyFromPath(string $path): array
{
    // Only drop empty segments. A callback-less array_filter() would also
    // discard the valid segment "0".
    $parts = array_filter(
        explode('/', trim($path, '/')),
        static fn (string $segment): bool => $segment !== '',
    );

    // Map common path segments to category names.
    $mapping = [
        'server' => 'Server',
        'modul' => 'Module',
        'anwendungen' => 'Anwendungen',
        'mcp' => 'MCP-Server',
        'ki-tasks' => 'KI-Tasks',
    ];

    $taxonomy = [];
    foreach ($parts as $part) {
        // Case-insensitive lookup so "MCP" and "mcp" end up in the same category.
        $taxonomy[] = $mapping[strtolower($part)] ?? ucfirst($part);
    }

    return array_slice($taxonomy, 0, 3);
}
/**
 * Stores analysis results in the database and marks the chunk 'completed'.
 *
 * The first taxonomy level is written to taxonomy_category (NULL when the
 * taxonomy is empty). Taxonomy path, entities and keywords are stored as JSON.
 *
 * @param array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>} $analysis
 *
 * @throws RuntimeException When the results cannot be encoded as JSON.
 */
private function storeAnalysisResults(int $chunkId, array $analysis): void
{
    $taxonomyPath = $analysis['taxonomy'];

    try {
        // JSON_THROW_ON_ERROR: without it json_encode() returns false on failure
        // (for example invalid UTF-8) and that false would be written to the
        // row while the chunk is still marked 'completed'.
        $params = [
            'id' => $chunkId,
            'category' => $taxonomyPath[0] ?? null,
            'taxonomy' => json_encode($taxonomyPath, JSON_THROW_ON_ERROR),
            'entities' => json_encode($analysis['entities'], JSON_THROW_ON_ERROR),
            'keywords' => json_encode($analysis['keywords'], JSON_THROW_ON_ERROR),
            'model' => self::TAXONOMY_MODEL,
        ];
    } catch (\JsonException $e) {
        // Re-raise as RuntimeException, the failure type this class already uses.
        throw new RuntimeException(
            "Failed to encode analysis results for chunk #{$chunkId}: " . $e->getMessage(),
            previous: $e,
        );
    }

    $sql = "UPDATE dokumentation_chunks SET
taxonomy_category = :category,
taxonomy_path = :taxonomy,
entities = :entities,
keywords = :keywords,
analysis_model = :model,
analysis_status = 'completed',
analysis_error = NULL,
analyzed_at = NOW()
WHERE id = :id";
    $stmt = $this->pdo->prepare($sql);
    $stmt->execute($params);
}
/**
 * Writes the analysis status of a chunk, together with an optional error text.
 *
 * analysis_error is always overwritten; it becomes NULL when no $error is given.
 */
private function updateStatus(int $chunkId, string $status, ?string $error = null): void
{
    $params = ['id' => $chunkId, 'status' => $status, 'error' => $error];

    $this->pdo
        ->prepare('UPDATE dokumentation_chunks SET analysis_status = :status, analysis_error = :error WHERE id = :id')
        ->execute($params);
}
/**
 * Loads one row from dokumentation_chunks by its primary key.
 *
 * @return array<string, mixed>|null The row as an associative array, or null when no row matches.
 */
private function getChunk(int $id): ?array
{
    $query = $this->pdo->prepare('SELECT * FROM dokumentation_chunks WHERE id = :id');
    $query->execute(['id' => $id]);

    $row = $query->fetch(PDO::FETCH_ASSOC);
    if ($row === false) {
        return null;
    }

    return $row;
}
/**
 * Fetches chunks whose analysis_status is 'pending'.
 *
 * Rows are ordered by document and chunk index, and at most $limit rows are returned.
 *
 * @return array<array<string, mixed>>
 */
private function getPendingChunks(int $limit): array
{
    $query = $this->pdo->prepare("
SELECT * FROM dokumentation_chunks
WHERE analysis_status = 'pending'
ORDER BY dokumentation_id, chunk_index
LIMIT :limit
");
    // Bound explicitly as an integer for the LIMIT clause.
    $query->bindValue('limit', $limit, PDO::PARAM_INT);
    $query->execute();

    $rows = $query->fetchAll(PDO::FETCH_ASSOC);

    return $rows;
}
/**
 * Looks up title and path of a document.
 *
 * When the document is not found, or a column is NULL, the placeholders
 * 'Unbekannt' (title) and '/' (path) are used.
 *
 * @return array{title: string, path: string}
 */
private function getDocumentContext(int $docId): array
{
    $query = $this->pdo->prepare('SELECT title, path FROM dokumentation WHERE id = :id');
    $query->execute(['id' => $docId]);

    $row = $query->fetch(PDO::FETCH_ASSOC);
    if (!is_array($row)) {
        return ['title' => 'Unbekannt', 'path' => '/'];
    }

    $title = $row['title'] ?? 'Unbekannt';
    $path = $row['path'] ?? '/';

    return ['title' => $title, 'path' => $path];
}
/**
 * Gets analysis statistics.
 *
 * Returns the number of chunks per analysis status, plus the number of chunks
 * per taxonomy category (categories that are NULL are excluded), ordered by
 * count in descending order.
 *
 * @return array{pending: int, processing: int, completed: int, failed: int, by_category: array<array{category: string, count: int}>}
 */
public function getStats(): array
{
    $stmt = $this->pdo->query("
SELECT
SUM(CASE WHEN analysis_status = 'pending' THEN 1 ELSE 0 END) as pending,
SUM(CASE WHEN analysis_status = 'processing' THEN 1 ELSE 0 END) as processing,
SUM(CASE WHEN analysis_status = 'completed' THEN 1 ELSE 0 END) as completed,
SUM(CASE WHEN analysis_status = 'failed' THEN 1 ELSE 0 END) as failed
FROM dokumentation_chunks
");
    $counts = $stmt->fetch(PDO::FETCH_ASSOC);

    $stmt = $this->pdo->query('
SELECT taxonomy_category as category, COUNT(*) as count
FROM dokumentation_chunks
WHERE taxonomy_category IS NOT NULL
GROUP BY taxonomy_category
ORDER BY count DESC
');

    // Cast each row so the result matches the documented shape (count: int),
    // the same way the scalar counters below are cast.
    $byCategory = array_map(
        static fn (array $row): array => [
            'category' => (string) $row['category'],
            'count' => (int) $row['count'],
        ],
        $stmt->fetchAll(PDO::FETCH_ASSOC),
    );

    return [
        // ?? 0 covers the NULL that SUM() yields on an empty table.
        'pending' => (int) ($counts['pending'] ?? 0),
        'processing' => (int) ($counts['processing'] ?? 0),
        'completed' => (int) ($counts['completed'] ?? 0),
        'failed' => (int) ($counts['failed'] ?? 0),
        'by_category' => $byCategory,
    ];
}
/**
 * Creates the database connection used by this service.
 *
 * Delegates to DatabaseFactory::dev(). NOTE(review): presumably this selects
 * the development database; confirm that is intended wherever the service runs.
 *
 * @return PDO
 */
private function createConnection(): PDO
{
return \Infrastructure\Config\DatabaseFactory::dev();
}
}
Vollständig herunterladen
Aktionen
Andere Versionen dieser Datei
| ID | Version | Typ | Größe | Datum |
| 1864 | 19 | modified | 3.5 KB | 2025-12-27 23:48 |
| 1860 | 18 | modified | 3.5 KB | 2025-12-27 23:47 |
| 1492 | 17 | modified | 3.5 KB | 2025-12-25 17:28 |
| 1441 | 16 | modified | 3.5 KB | 2025-12-25 17:00 |
| 1425 | 15 | modified | 4.5 KB | 2025-12-25 16:59 |
| 1419 | 14 | modified | 13.2 KB | 2025-12-25 16:59 |
| 1406 | 13 | modified | 12.8 KB | 2025-12-25 16:58 |
| 1399 | 12 | modified | 12.4 KB | 2025-12-25 16:58 |
| 1395 | 11 | modified | 12.5 KB | 2025-12-25 16:58 |
| 856 | 10 | modified | 12.6 KB | 2025-12-23 08:46 |
| 855 | 9 | modified | 12.7 KB | 2025-12-23 08:46 |
| 786 | 8 | modified | 12.9 KB | 2025-12-23 08:05 |
| 398 | 7 | modified | 12.9 KB | 2025-12-22 08:49 |
| 397 | 6 | modified | 13.0 KB | 2025-12-22 08:49 |
| 328 | 5 | modified | 13.0 KB | 2025-12-22 08:08 |
| 327 | 4 | modified | 12.9 KB | 2025-12-22 08:08 |
| 326 | 3 | modified | 12.9 KB | 2025-12-22 08:08 |
| 36 | 2 | modified | 13.7 KB | 2025-12-20 17:23 |
| 27 | 1 | modified | 13.7 KB | 2025-12-20 17:18 |
← Zurück zur Übersicht