PipelineStepConfig.php
- Pfad:
src/Infrastructure/Config/PipelineStepConfig.php - Namespace: Infrastructure\Config
- Zeilen: 415 | Größe: 17,456 Bytes
- Geändert: 2025-12-31 02:37:24 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 86
- Dependencies: 100 (25%)
- LOC: 28 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Klassen 1
-
PipelineStepConfig class Zeile 9
Funktionen 5
-
getStepTypes() public Zeile 16 -
getCollections() public Zeile 350 -
getDefaultSteps() public Zeile 365 -
getDefaultExtensions() public Zeile 381 -
parseExtensions() public Zeile 392
Verwendet von 3
Versionen 16
-
v16
2025-12-31 02:37 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v15
2025-12-31 02:21 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v14
2025-12-28 01:52 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v13
2025-12-28 01:52 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v12
2025-12-28 01:51 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v11
2025-12-28 01:51 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v10
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v9
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v8
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v7
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v6
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v5
2025-12-28 01:16 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v4
2025-12-28 01:13 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v3
2025-12-28 01:12 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v2
2025-12-28 01:12 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation -
v1
2025-12-23 08:02 | claude-code-hook | modified
Claude Code Pre-Hook Backup vor Edit-Operation
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Config;
// @responsibility: Konfiguration für Pipeline-Schritte und Collections
/**
 * Static catalogue of pipeline step types, vector collections and defaults.
 *
 * Every method is static and returns literal data; the class holds no state.
 * All labels and descriptions are German strings and are returned exactly as
 * written here.
 */
final class PipelineStepConfig
{
    /**
     * Get all available step types with their configuration.
     *
     * The result is keyed by the step type identifier. Every entry has the keys
     * 'label', 'description', 'phase' and 'storage' ('storage' is a string or
     * null). Some entries additionally carry 'uses_llm', 'uses_vision',
     * 'fixed_model' or 'has_collection'; entries without these keys simply omit
     * them, so callers have to treat them as optional.
     *
     * The section comments inside the array are only a rough grouping aid. The
     * 'phase' value of each entry is what the code actually carries, and it does
     * not always match the section the entry is listed in.
     *
     * Several 'storage' strings contain the literal text "{collection}" -
     * presumably a placeholder that is substituted elsewhere; verify at the caller.
     *
     * @return array<string, array<string, mixed>>
     */
    public static function getStepTypes(): array
    {
        return [
            // Preprocessing ("Vorverarbeitung")
            'detect' => [
                'label' => 'Erkennung',
                'description' => 'Dateien scannen und Format prüfen',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'validate' => [
                'label' => 'Validierung',
                'description' => 'Datei-Prüfung auf Lesbarkeit und Korruption',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'page_split' => [
                'label' => 'Seitenzerlegung',
                'description' => 'PDF in Einzelseiten zerlegen für Referenz und Vision-Analyse',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_pages',
            ],
            'vision_analyze' => [
                'label' => 'Bildanalyse',
                'description' => 'Seiten via Vision-Modell analysieren, Bilder und Grafiken erkennen',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_pages (vision_analysis)',
                'uses_vision' => true,
            ],
            'extract' => [
                'label' => 'Textextraktion',
                'description' => 'Text extrahieren, OCR für Bilder mit Text',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'structure' => [
                'label' => 'Strukturerkennung',
                'description' => 'Überschriften, Listen und Hierarchie erkennen',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_sections',
            ],
            'segment' => [
                'label' => 'Abschnitte',
                'description' => 'Logische Dokumentgliederung nach Struktur',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.document_sections',
            ],
            'chunk' => [
                'label' => 'Textbausteine',
                'description' => 'Chunks erstellen (max 800 Token) mit Seitenreferenz',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.chunks',
            ],
            'queue' => [
                'label' => 'Warteschlange',
                'description' => 'Dokument zur Verarbeitung einreihen',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'hash' => [
                'label' => 'Hash-Berechnung',
                'description' => 'SHA256-Hash für Duplikat-Erkennung berechnen',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'duplicate_check' => [
                'label' => 'Duplikat-Prüfung',
                'description' => 'Hash-Vergleich, bei Treffer Pipeline-Abbruch',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.documents (status)',
            ],
            'rotation' => [
                'label' => 'Seitenausrichtung',
                'description' => 'Seiten-Rotation per OSD korrigieren',
                'phase' => 'Vorverarbeitung',
                'storage' => null,
            ],
            'enrich' => [
                'label' => 'Anreicherung',
                'description' => 'Überschriften und Keywords extrahieren',
                'phase' => 'Vorverarbeitung',
                'storage' => 'ki_content.chunks (headings, keywords)',
            ],

            // Text semantics ("Textsemantik"): HOW is something said?
            'text_semantic_analyze' => [
                'label' => 'Textsemantik-Analyse',
                'description' => 'Aussageform, Intent, Frame, Negation pro Chunk analysieren',
                'phase' => 'Textsemantik',
                'storage' => null,
                'uses_llm' => true,
            ],
            'text_semantic_store' => [
                'label' => 'Textsemantik-Speicherung',
                'description' => 'Analyse-Ergebnisse in chunk_semantics speichern',
                'phase' => 'Textsemantik',
                'storage' => 'ki_content.chunk_semantics',
            ],

            // Storage and vectorisation ("Speicherung")
            'metadata_store' => [
                'label' => 'DB-Speicherung',
                'description' => 'Dokument, Seiten und Chunks in MariaDB speichern',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.documents, .document_pages, .chunks',
            ],
            'embed' => [
                'label' => 'Vektorisierung',
                'description' => 'Embeddings erstellen für Vektor-Suche',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
                'fixed_model' => 'mxbai-embed-large (1024-dim)',
                'has_collection' => true,
            ],
            'collection_setup' => [
                'label' => 'Collection',
                'description' => 'Qdrant-Collection einrichten falls nötig',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'vector_store' => [
                'label' => 'Vektorspeicherung',
                'description' => 'Vektoren in Qdrant mit MariaDB-ID als Referenz',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'index_optimize' => [
                'label' => 'Index-Optimierung',
                'description' => 'HNSW-Index für schnelle Suche optimieren',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'doc_create' => [
                'label' => 'Dokument-Eintrag',
                'description' => 'Dokument-Datensatz in Datenbank erstellen',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.documents',
            ],
            'page_store' => [
                'label' => 'Seiten-Speicherung',
                'description' => 'Einzelseiten in Datenbank speichern',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.document_pages',
            ],
            // Listed among the storage steps, but its 'phase' value is 'Analyse'.
            'vision' => [
                'label' => 'Bild-Analyse',
                'description' => 'Seiten via Vision-LLM analysieren',
                'phase' => 'Analyse',
                'storage' => null,
                'uses_llm' => true,
            ],
            'vision_store' => [
                'label' => 'Vision-Speicherung',
                'description' => 'Vision-Analyse-Ergebnisse speichern',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.document_pages (vision_analysis)',
            ],
            'chunk_store' => [
                'label' => 'Chunk-Speicherung',
                'description' => 'Textbausteine in Datenbank speichern',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.chunks',
            ],
            'qdrant_store' => [
                'label' => 'Vektor-Speicherung',
                'description' => 'Embedding-Vektoren in Qdrant speichern',
                'phase' => 'Speicherung',
                'storage' => 'Qdrant: {collection}',
            ],
            'status_update' => [
                'label' => 'Status-Update',
                'description' => 'Dokument-Status aktualisieren',
                'phase' => 'Speicherung',
                'storage' => 'ki_content.documents (status)',
            ],

            // Knowledge extraction ("Wissen")
            'knowledge_page' => [
                'label' => 'Seiten-Wissen',
                'description' => 'Pro Seite: Entitäten → Semantik → Ontologie → Taxonomie',
                'phase' => 'Wissen',
                'storage' => 'ki_content.page_knowledge, .entities, .entity_semantics',
                'uses_llm' => true,
            ],
            'knowledge_section' => [
                'label' => 'Abschnitt-Wissen',
                'description' => 'Pro Kapitel: Aggregierte Wissensrepräsentation',
                'phase' => 'Wissen',
                'storage' => 'ki_content.section_knowledge',
                'uses_llm' => true,
            ],
            'knowledge_document' => [
                'label' => 'Dokument-Wissen',
                'description' => 'Konsolidierte Gesamtsicht des Dokuments',
                'phase' => 'Wissen',
                'storage' => 'ki_content.document_knowledge',
                'uses_llm' => true,
            ],
            'knowledge_validate' => [
                'label' => 'Wissens-Validierung',
                'description' => 'Abgleich mit DB, Duplikate zusammenführen, neue validieren',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entities (merged)',
            ],

            // Knowledge extraction, analysis steps ("Analyse")
            'entity_extract' => [
                'label' => 'Entitäten-Extraktion',
                'description' => 'Personen, Organisationen, Konzepte, Methoden erkennen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_entities',
                'uses_llm' => true,
            ],
            'relation_extract' => [
                'label' => 'Beziehungs-Extraktion',
                'description' => 'Relationen zwischen Entitäten extrahieren',
                'phase' => 'Analyse',
                'storage' => 'ki_content.entity_relations',
                'uses_llm' => true,
            ],
            'taxonomy_build' => [
                'label' => 'Taxonomie-Aufbau',
                'description' => 'Hierarchische Kategorisierung aufbauen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_taxonomy, .taxonomy_terms',
                'uses_llm' => true,
            ],
            'semantic_analyze' => [
                'label' => 'Semantik-Analyse',
                'description' => 'Bedeutungs-Analyse, Konzepte und Definitionen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_semantics',
                'uses_llm' => true,
            ],
            'summarize' => [
                'label' => 'Zusammenfassung',
                'description' => 'Dokument- und Seiten-Zusammenfassungen erstellen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.documents (summary), .document_pages',
                'uses_llm' => true,
            ],
            'question_generate' => [
                'label' => 'Fragengenerierung',
                'description' => 'Beispielfragen für RAG-Chat erstellen',
                'phase' => 'Analyse',
                'storage' => 'ki_content.generated_questions',
                'uses_llm' => true,
            ],
            'entity_store' => [
                'label' => 'Entitäten-Speicherung',
                'description' => 'Extrahierte Entitäten in Datenbank speichern',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entities, .document_entities',
            ],
            'entity_normalize' => [
                'label' => 'Entitäten-Normalisierung',
                'description' => 'Duplikate zusammenführen, Synonyme verknüpfen',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entity_synonyms',
            ],

            // Knowledge semantics ("Wissenssemantik"): WHAT does an entity mean in context?
            'knowledge_semantic_analyze' => [
                'label' => 'Wissenssemantik-Analyse',
                'description' => 'Bedeutung der Entitäten im Kontext: Rolle, Eigenschaften, Funktion',
                'phase' => 'Wissenssemantik',
                'storage' => null,
                'uses_llm' => true,
            ],
            'knowledge_semantic_store' => [
                'label' => 'Wissenssemantik-Speicherung',
                'description' => 'Analyse-Ergebnisse in entity_semantics speichern',
                'phase' => 'Wissenssemantik',
                'storage' => 'ki_content.entity_semantics',
            ],
            'relation_store' => [
                'label' => 'Beziehungs-Speicherung',
                'description' => 'Extrahierte Relationen in Datenbank speichern',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entity_relations',
            ],
            'ontology_classify' => [
                'label' => 'Ontologie-Klassifikation',
                'description' => 'Entitäten in Ontologie-Klassen einordnen',
                'phase' => 'Wissen',
                'storage' => 'ki_content.ontology_classes',
                'uses_llm' => true,
            ],
            'ontology_store' => [
                'label' => 'Ontologie-Speicherung',
                'description' => 'Ontologie-Klassifikationen in Datenbank speichern',
                'phase' => 'Wissen',
                'storage' => 'ki_content.ontology_classes',
            ],
            'chunk_entity_link' => [
                'label' => 'Chunk-Entitäten-Verknüpfung',
                'description' => 'Chunks mit erkannten Entitäten verknüpfen',
                'phase' => 'Wissen',
                'storage' => 'ki_content.chunk_entities',
            ],
            'chunk_taxonomy' => [
                'label' => 'Chunk-Taxonomie',
                'description' => 'Chunks mit Taxonomie-Kategorien verknüpfen',
                'phase' => 'Wissen',
                'storage' => 'ki_content.chunk_taxonomy',
            ],
            'entity_taxonomy' => [
                'label' => 'Entitäten-Taxonomie',
                'description' => 'Entitäten mit Taxonomie-Pfaden verknüpfen',
                'phase' => 'Wissen',
                'storage' => 'ki_content.entity_taxonomy_mapping',
            ],
            'chunk_semantics' => [
                'label' => 'Chunk-Semantik',
                'description' => 'Semantische Analyse-Ergebnisse pro Chunk speichern',
                'phase' => 'Wissen',
                'storage' => 'ki_content.chunk_semantics',
            ],
            'finalize' => [
                'label' => 'Abschluss',
                'description' => 'Status finalisieren und Job beenden',
                'phase' => 'Analyse',
                'storage' => 'ki_content.documents (status)',
            ],
            'analyze' => [
                'label' => 'Kombinierte Analyse',
                'description' => 'Entitäten, Relationen und Taxonomie in einem Schritt',
                'phase' => 'Analyse',
                'storage' => 'ki_content.chunk_entities, .chunk_semantics',
                'uses_llm' => true,
            ],
        ];
    }

    /**
     * Get available Qdrant collections.
     *
     * @return array<string, string> Collection identifier => display label.
     */
    public static function getCollections(): array
    {
        return [
            'documents' => 'Documents (Schulungsunterlagen)',
            'mail' => 'Mail (E-Mails)',
            'entities' => 'Entities (Entitäten)',
            'knowledge' => 'Knowledge (Wissensbasis)',
        ];
    }

    /**
     * Get default steps for new pipelines.
     *
     * Each row has the keys 'step_type', 'config', 'sort_order' and 'enabled'.
     * 'enabled' is the integer 1 or 0, not a bool; the 'analyze' step is the
     * only one that is disabled (0) by default.
     *
     * @return array<array<string, mixed>>
     */
    public static function getDefaultSteps(): array
    {
        return [
            // NOTE(review): 'detect' carries a hash_algorithm although a separate
            // 'hash' step type exists in getStepTypes() - confirm this is intended.
            ['step_type' => 'detect', 'config' => ['hash_algorithm' => 'sha256'], 'sort_order' => 1, 'enabled' => 1],
            ['step_type' => 'extract', 'config' => ['ocr_enabled' => true, 'ocr_language' => 'deu'], 'sort_order' => 2, 'enabled' => 1],
            // NOTE(review): the 'chunk' step description mentions "max 800 Token"
            // while max_size is 2000 here; the unit of min_size/max_size is not
            // visible in this file - confirm the two are consistent.
            ['step_type' => 'chunk', 'config' => ['min_size' => 100, 'max_size' => 2000, 'overlap' => 0.1], 'sort_order' => 3, 'enabled' => 1],
            ['step_type' => 'embed', 'config' => ['model' => 'mxbai-embed-large', 'collection' => 'documents', 'dimensions' => 1024], 'sort_order' => 4, 'enabled' => 1],
            ['step_type' => 'analyze', 'config' => ['extract_entities' => true, 'extract_relations' => true, 'classify_taxonomy' => true], 'sort_order' => 5, 'enabled' => 0],
        ];
    }

    /**
     * Get default file extensions.
     *
     * @return array<string> Lower-case extensions, each with a leading dot.
     */
    public static function getDefaultExtensions(): array
    {
        return ['.pdf', '.docx', '.pptx', '.md', '.txt'];
    }

    /**
     * Parse extension string into array.
     *
     * The input is split on any run of whitespace, commas or semicolons. Each
     * token is normalised to a lower-case extension with exactly one leading
     * dot:
     *
     *  - leading "*" and "." characters are stripped first, so "pdf", ".pdf"
     *    and the glob form "*.pdf" all become ".pdf";
     *  - tokens that consist only of "*" / "." characters are dropped, because
     *    they do not name an extension;
     *  - duplicates are removed, keeping the first occurrence's position.
     *
     * If nothing usable remains (or the split itself fails), the default
     * extension list is returned instead of an empty array.
     *
     * @param string $input Free-form list such as "pdf, .docx; *.md".
     * @return array<string> Sequentially indexed list of normalised extensions.
     */
    public static function parseExtensions(string $input): array
    {
        // PREG_SPLIT_NO_EMPTY drops the empty tokens produced by leading or
        // trailing separators, so no per-token trim/empty check is needed.
        $tokens = preg_split('/[\s,;]+/', $input, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens === false) {
            return self::getDefaultExtensions();
        }

        $extensions = [];
        foreach ($tokens as $token) {
            $bare = ltrim($token, '*.');
            if ($bare === '') {
                continue;
            }

            $extension = '.' . strtolower($bare);
            // Keyed by the extension itself to de-duplicate in first-seen order.
            // Keys always start with "." and therefore are never cast to int.
            $extensions[$extension] = $extension;
        }

        return $extensions !== [] ? array_values($extensions) : self::getDefaultExtensions();
    }
}