ChunkAnalyzer.php
- Pfad: src/Infrastructure/Docs/ChunkAnalyzer.php
- Namespace: Infrastructure\Docs
- Zeilen: 224 | Größe: 6,604 Bytes
- Geändert: 2025-12-25 16:57:23 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 98
- Dependencies: 100 (25%)
- LOC: 92 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 3
- constructor Infrastructure\AI\OllamaService
- use Infrastructure\AI\OllamaService
- use RuntimeException
Klassen 1
- ChunkAnalyzer (class), Zeile 12
Funktionen 9
- __construct() public, Zeile 17
- analyze() public, Zeile 29
- buildAnalysisPrompt() private, Zeile 57
- callLlmWithRetry() private, Zeile 84
- parseAnalysisResponse() private, Zeile 109
- validateArray() private, Zeile 144
- validateEntities() private, Zeile 162
- deriveTaxonomyFromPath() private, Zeile 188
- decodeJsonArray() private, Zeile 213
Verwendet von 2
- ChunkAnalysisService.php constructor
- InfrastructureServiceProvider.php use
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Docs;
// @responsibility: Analysiert Chunks via LLM (Taxonomie, Entities, Keywords)
use Infrastructure\AI\OllamaService;
use RuntimeException;
/**
 * Analyses documentation chunks with an LLM and returns a taxonomy, a list of
 * entities and a list of keywords for each chunk.
 *
 * The LLM output is treated as untrusted: it is extracted from free text,
 * decoded defensively and every field is validated before it is returned.
 */
final class ChunkAnalyzer
{
    /** Model name passed to OllamaService::generate(). */
    private const string TAXONOMY_MODEL = 'gemma3:4b-it-qat';

    /** Total number of attempts before the LLM call is reported as failed. */
    private const int MAX_RETRIES = 3;

    /** Base delay between attempts in microseconds; multiplied by the attempt number. */
    private const int RETRY_BACKOFF_US = 500_000;

    /** Maximum number of taxonomy levels derived from a document path. */
    private const int MAX_TAXONOMY_DEPTH = 3;

    /** Entity type used when the LLM response carries no usable type. */
    private const string DEFAULT_ENTITY_TYPE = 'OTHER';

    /** Display labels for known path segments (exact, case-sensitive match). */
    private const array PATH_SEGMENT_LABELS = [
        'server' => 'Server',
        'modul' => 'Module',
        'anwendungen' => 'Anwendungen',
        'mcp' => 'MCP-Server',
        'ki-tasks' => 'KI-Tasks',
    ];

    public function __construct(
        private readonly OllamaService $ollama,
    ) {
    }

    /**
     * Performs LLM analysis on chunk content.
     *
     * Reads `content_clean` (falling back to `content`) and `heading_path`
     * from the chunk. When the LLM yields no taxonomy, one is derived from
     * the document path instead.
     *
     * @param array<string, mixed> $chunk
     * @param array{title: string, path: string} $docContext
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     *
     * @throws RuntimeException When the LLM call fails on every attempt.
     */
    public function analyze(array $chunk, array $docContext): array
    {
        // Tolerate chunks that carry neither content key or a non-string value.
        $content = $chunk['content_clean'] ?? $chunk['content'] ?? '';
        if (!is_string($content)) {
            $content = is_scalar($content) ? (string) $content : '';
        }

        $context = sprintf(
            "Dokument: %s\nPfad: %s\nAbschnitt: %s\n\nInhalt:\n%s",
            $docContext['title'],
            $docContext['path'],
            implode(' > ', $this->resolveHeadingPath($chunk['heading_path'] ?? null)),
            $content,
        );

        $analysis = $this->parseAnalysisResponse(
            $this->callLlmWithRetry($this->buildAnalysisPrompt($context)),
        );

        // Fallback: if the LLM produced no taxonomy, derive it from the document path.
        if ($analysis['taxonomy'] === []) {
            $analysis['taxonomy'] = $this->deriveTaxonomyFromPath($docContext['path']);
        }

        return $analysis;
    }

    /**
     * Builds the analysis prompt around the given chunk context.
     */
    private function buildAnalysisPrompt(string $context): string
    {
        return <<<PROMPT
        Analysiere den folgenden technischen Dokumentationsabschnitt und extrahiere strukturierte Informationen.
        {$context}
        Antworte NUR mit einem JSON-Objekt in diesem exakten Format (keine Erklärungen):
        {
        "taxonomy": ["Hauptkategorie", "Unterkategorie", "Thema"],
        "entities": [
        {"name": "Entitätsname", "type": "TECHNOLOGY|CONCEPT|CONFIG|COMMAND|SERVICE"}
        ],
        "keywords": ["keyword1", "keyword2", "keyword3"]
        }
        Regeln:
        - taxonomy: Hierarchische Klassifikation (3 Ebenen: Bereich > Modul > Thema)
        - entities: Wichtige Technologien, Konzepte, Konfigurationen, Befehle, Dienste
        - keywords: 3-5 relevante Suchbegriffe
        - Antworte NUR mit dem JSON, keine anderen Texte
        PROMPT;
    }

    /**
     * Calls the LLM, retrying on RuntimeException.
     *
     * Sleeps RETRY_BACKOFF_US multiplied by the attempt number between
     * attempts (no sleep after the last one).
     *
     * @throws RuntimeException After MAX_RETRIES failed attempts; the last
     *                          failure is attached as the previous exception.
     */
    private function callLlmWithRetry(string $prompt): string
    {
        $lastError = null;

        for ($attempt = 1; $attempt <= self::MAX_RETRIES; $attempt++) {
            try {
                return $this->ollama->generate($prompt, self::TAXONOMY_MODEL);
            } catch (RuntimeException $e) {
                $lastError = $e;
                if ($attempt < self::MAX_RETRIES) {
                    usleep(self::RETRY_BACKOFF_US * $attempt); // Progressive backoff
                }
            }
        }

        throw new RuntimeException(
            'LLM call failed after ' . self::MAX_RETRIES . ' attempts: '
                . ($lastError?->getMessage() ?? 'No attempts made'),
            previous: $lastError,
        );
    }

    /**
     * Parses the LLM response into structured data.
     *
     * Each JSON candidate found in the response is tried in turn; the first
     * one that decodes to an array wins. If none does, all fields are empty.
     *
     * @return array{taxonomy: array<string>, entities: array<array{name: string, type: string}>, keywords: array<string>}
     */
    private function parseAnalysisResponse(string $response): array
    {
        foreach ($this->extractJsonCandidates($response) as $candidate) {
            $decoded = json_decode($candidate, true);
            if (is_array($decoded)) {
                return [
                    'taxonomy' => $this->validateArray($decoded['taxonomy'] ?? []),
                    'entities' => $this->validateEntities($decoded['entities'] ?? []),
                    'keywords' => $this->validateArray($decoded['keywords'] ?? []),
                ];
            }
        }

        return [
            'taxonomy' => [],
            'entities' => [],
            'keywords' => [],
        ];
    }

    /**
     * Collects the substrings of the response that may hold the JSON payload,
     * most specific first: fenced markdown code block, outermost `{...}` span,
     * then the raw response.
     *
     * @return list<string>
     */
    private function extractJsonCandidates(string $response): array
    {
        $candidates = [];

        if (preg_match('/```(?:json)?\s*([\s\S]*?)\s*```/', $response, $matches) === 1) {
            $candidates[] = $matches[1];
        }
        if (preg_match('/\{[\s\S]*\}/', $response, $matches) === 1) {
            $candidates[] = $matches[0];
        }
        $candidates[] = $response;

        // Avoid decoding the same text twice when candidates coincide.
        return array_values(array_unique($candidates));
    }

    /**
     * Validates an array of strings.
     *
     * Non-string items and items that are blank after trimming are dropped;
     * the remaining items are returned trimmed and re-indexed.
     *
     * @param mixed $arr
     * @return array<string>
     */
    private function validateArray(mixed $arr): array
    {
        if (!is_array($arr)) {
            return [];
        }

        $strings = [];
        foreach ($arr as $item) {
            if (!is_string($item)) {
                continue;
            }
            $item = trim($item);
            if ($item !== '') {
                $strings[] = $item;
            }
        }

        return $strings;
    }

    /**
     * Validates the entities array.
     *
     * Entries without a non-blank string `name` are dropped. The type is
     * trimmed and upper-cased; a missing, non-string or blank type becomes
     * DEFAULT_ENTITY_TYPE.
     *
     * @param mixed $entities
     * @return array<array{name: string, type: string}>
     */
    private function validateEntities(mixed $entities): array
    {
        if (!is_array($entities)) {
            return [];
        }

        $result = [];
        foreach ($entities as $entity) {
            if (!is_array($entity) || !is_string($entity['name'] ?? null)) {
                continue;
            }

            $name = trim($entity['name']);
            if ($name === '') {
                continue;
            }

            $type = is_string($entity['type'] ?? null)
                ? strtoupper(trim($entity['type']))
                : '';

            $result[] = [
                'name' => $name,
                'type' => $type !== '' ? $type : self::DEFAULT_ENTITY_TYPE,
            ];
        }

        return $result;
    }

    /**
     * Derives a taxonomy from the document path.
     *
     * The path is split on `/`; known segments are replaced by their label,
     * all others get an upper-cased first character. At most
     * MAX_TAXONOMY_DEPTH levels are returned.
     *
     * @return array<string>
     */
    private function deriveTaxonomyFromPath(string $path): array
    {
        // Compare against '' explicitly: a bare array_filter() would also
        // drop a legitimate "0" segment because it is falsy.
        $segments = array_filter(
            array_map(trim(...), explode('/', $path)),
            static fn (string $segment): bool => $segment !== '',
        );

        $taxonomy = [];
        foreach ($segments as $segment) {
            $taxonomy[] = self::PATH_SEGMENT_LABELS[$segment] ?? ucfirst($segment);
        }

        return array_slice($taxonomy, 0, self::MAX_TAXONOMY_DEPTH);
    }

    /**
     * Normalises a chunk's heading path, which may be a JSON-encoded string
     * or an already decoded array; anything else yields an empty path.
     *
     * @return array<string>
     */
    private function resolveHeadingPath(mixed $raw): array
    {
        if (is_array($raw)) {
            return $this->validateArray($raw);
        }

        return is_string($raw) ? $this->decodeJsonArray($raw) : [];
    }

    /**
     * Decodes a JSON array safely.
     *
     * Returns an empty array for null, empty or undecodable input. Only
     * non-blank string items of the decoded array are kept.
     *
     * @return array<string>
     */
    private function decodeJsonArray(?string $json): array
    {
        if ($json === null || $json === '') {
            return [];
        }

        return $this->validateArray(json_decode($json, true));
    }
}