ContentSearchService.php
- Pfad: src/Infrastructure/Search/ContentSearchService.php
- Namespace: Infrastructure\Search
- Zeilen: 365 | Größe: 11,932 Bytes
- Geändert: 2025-12-30 03:15:35 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 68
- Dependencies: 90 (25%)
- LOC: 0 (20%)
- Methods: 50 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Issues 1
| Zeile | Typ | Beschreibung |
| --- | --- | --- |
| - | complexity | Datei hat 365 Zeilen (max: 350) |
Dependencies 10
- implements Domain\Service\SearchServiceInterface
- trait Infrastructure\Traits\JsonDecodeTrait
- constructor PDO
- constructor Infrastructure\AI\OllamaService
- use Domain\Service\SearchServiceInterface
- use Infrastructure\AI\OllamaService
- use Infrastructure\Config\CredentialService
- use Infrastructure\Traits\JsonDecodeTrait
- use PDO
- use RuntimeException
Klassen 1
-
ContentSearchService
class
Zeile 16
Funktionen 15
-
__construct()
public
Zeile 26
-
search()
public
Zeile 34
-
searchDefinitions()
public
Zeile 53
-
searchByIntent()
public
Zeile 59
-
searchByTaxonomy()
public
Zeile 65
-
suggestRelatedSearches()
public
Zeile 71
-
getSemanticStats()
public
Zeile 87
-
semanticSearch()
private
Zeile 104
-
enrichWithSemantics()
private
Zeile 136
-
getChunkWithSemantics()
private
Zeile 182
-
matchesFilters()
private
Zeile 210
-
chunkMatchesTaxonomy()
private
Zeile 236
-
rerank()
private
Zeile 253
-
isDefinitionQuery()
private
Zeile 304
-
makeRequest()
private
Zeile 324
Verwendet von 1
Code
<?php
declare(strict_types=1);
namespace Infrastructure\Search;
// @responsibility: Semantische Suche über ki_content mit Chunk-Semantik
use Domain\Service\SearchServiceInterface;
use Infrastructure\AI\OllamaService;
use Infrastructure\Config\CredentialService;
use Infrastructure\Traits\JsonDecodeTrait;
use PDO;
use RuntimeException;
final class ContentSearchService implements SearchServiceInterface
{
use JsonDecodeTrait;
private const string COLLECTION = 'documents';
private const int TIMEOUT = 30;
private const float MIN_SCORE = 0.3;
private string $qdrantHost;
/**
 * Stores the injected database handle and embedding client, and resolves the
 * Qdrant base URL once via CredentialService so every request can reuse it.
 */
public function __construct(
    private PDO $pdo,
    private OllamaService $ollama,
) {
    $this->qdrantHost = CredentialService::getQdrantHost();
}
/**
 * Hybrid search combining semantic vectors with SQL filters.
 *
 * Runs three stages: a vector search in Qdrant (over-fetching three times the
 * requested limit so that score/filter pruning still leaves candidates), an
 * enrichment pass that joins SQL chunk data and applies the filters, and a
 * heuristic re-rank.
 *
 * @param string               $query   Free-text search query.
 * @param array<string, mixed> $filters Optional filters. Keys read by the later stages are
 *                                      min_score, intent, discourse_role, sentiment, frame
 *                                      and taxonomy_path.
 * @param int                  $limit   Maximum number of results to return.
 *
 * @return list<array<string, mixed>> Enriched results, highest relevance_score first.
 */
public function search(string $query, array $filters = [], int $limit = 10): array
{
    // A non-positive limit or a blank query cannot produce meaningful results, so
    // bail out before paying for an embedding call and an HTTP round trip. This
    // also keeps a negative limit away from array_slice(), where it would mean
    // "drop items from the end" rather than "return nothing".
    if ($limit < 1 || trim($query) === '') {
        return [];
    }

    // Stage 1: semantic search in Qdrant (over-fetch to survive filtering).
    $vectorResults = $this->semanticSearch($query, $limit * 3);
    if ($vectorResults === []) {
        return [];
    }

    // Stage 2: enrich with SQL data (chunks + semantics) and apply filters.
    $enrichedResults = $this->enrichWithSemantics($vectorResults, $filters);

    // Stage 3: re-rank based on semantic relevance.
    $rankedResults = $this->rerank($enrichedResults, $query);

    return array_slice($rankedResults, 0, $limit);
}
/**
 * Convenience wrapper around search() that only keeps chunks whose
 * discourse_role is "definition".
 */
public function searchDefinitions(string $query, int $limit = 10): array
{
    $definitionsOnly = ['discourse_role' => 'definition'];

    return $this->search($query, $definitionsOnly, $limit);
}
/**
 * Convenience wrapper around search() that only keeps chunks whose stored
 * intent equals the given value.
 */
public function searchByIntent(string $query, string $intent, int $limit = 10): array
{
    $intentFilter = ['intent' => $intent];

    return $this->search($query, $intentFilter, $limit);
}
/**
 * Convenience wrapper around search() that only keeps chunks assigned to a
 * taxonomy term whose path starts with the given taxonomy path.
 */
public function searchByTaxonomy(string $query, string $taxonomyPath, int $limit = 10): array
{
    $taxonomyFilter = ['taxonomy_path' => $taxonomyPath];

    return $this->search($query, $taxonomyFilter, $limit);
}
/**
 * Suggests related searches based on current results.
 *
 * Collects the first distinct keywords found in the results, in result order,
 * up to the given cap.
 *
 * @param array<int, array<string, mixed>> $results        Search results; only the optional
 *                                                         "keywords" entry of each is read.
 * @param int                              $maxSuggestions Maximum number of suggestions
 *                                                         (defaults to the previous fixed cap of 5).
 *
 * @return list<mixed> Distinct keywords (compared strictly), at most $maxSuggestions entries.
 */
public function suggestRelatedSearches(array $results, int $maxSuggestions = 5): array
{
    $suggestions = [];
    if ($maxSuggestions < 1) {
        return $suggestions;
    }

    foreach ($results as $result) {
        $keywords = $result['keywords'] ?? [];
        if (!is_array($keywords)) {
            // A scalar here cannot be iterated; treat it as "no keywords".
            continue;
        }

        foreach ($keywords as $keyword) {
            if (in_array($keyword, $suggestions, true)) {
                continue;
            }

            $suggestions[] = $keyword;

            // Stop as soon as the cap is reached instead of scanning the rest.
            if (count($suggestions) >= $maxSuggestions) {
                return $suggestions;
            }
        }
    }

    return $suggestions;
}
/**
 * Gets semantic statistics for the content.
 *
 * Counts chunk_semantics rows per (intent, discourse_role) pair, ignoring rows
 * without an intent, ordered by descending count.
 *
 * @return list<array<string, mixed>> Rows with the keys intent, discourse_role and count.
 *
 * @throws RuntimeException When the query fails on a connection that reports errors
 *                          by return value rather than by exception.
 */
public function getSemanticStats(): array
{
    $stmt = $this->pdo->query('
SELECT
cs.intent,
cs.discourse_role,
COUNT(*) as count
FROM ki_content.chunk_semantics cs
WHERE cs.intent IS NOT NULL
GROUP BY cs.intent, cs.discourse_role
ORDER BY count DESC
');

    // PDO::query() returns false instead of throwing when the connection is not in
    // PDO::ERRMODE_EXCEPTION; calling fetchAll() on that would be a fatal error.
    if ($stmt === false) {
        throw new RuntimeException('Failed to query semantic statistics');
    }

    return $stmt->fetchAll(PDO::FETCH_ASSOC);
}
/**
 * Performs semantic search in Qdrant.
 *
 * Embeds the query via the Ollama client and posts the vector to the
 * collection's points/search endpoint. A failed request is deliberately treated
 * as "no results" so that search degrades gracefully.
 *
 * @param string $query Free-text query to embed.
 * @param int    $limit Maximum number of points requested from Qdrant.
 *
 * @return list<array{qdrant_id: string, score: float, payload: array}> Normalised hits
 *         in the order returned by Qdrant.
 */
private function semanticSearch(string $query, int $limit): array
{
    $embedding = $this->ollama->getEmbedding($query);

    // Without a vector there is nothing to search for; skip the HTTP round trip.
    if ($embedding === []) {
        return [];
    }

    $url = sprintf('%s/collections/%s/points/search', $this->qdrantHost, self::COLLECTION);
    $payload = [
        'vector' => array_values($embedding),
        'limit' => $limit,
        'with_payload' => true,
    ];

    try {
        $response = $this->makeRequest($url, $payload);
    } catch (RuntimeException) {
        // Best-effort: transport/HTTP failures yield an empty result set.
        return [];
    }

    $items = $response['result'] ?? null;
    if (!is_array($items)) {
        return [];
    }

    $hits = [];
    foreach ($items as $item) {
        // Skip malformed entries instead of failing the whole search: a non-array
        // item or a missing/non-scalar id cannot be mapped to a chunk anyway.
        if (!is_array($item)) {
            continue;
        }

        $id = $item['id'] ?? null;
        if (!is_int($id) && !is_string($id)) {
            continue;
        }

        $hits[] = [
            'qdrant_id' => (string) $id,
            'score' => (float) ($item['score'] ?? 0),
            'payload' => is_array($item['payload'] ?? null) ? $item['payload'] : [],
        ];
    }

    return $hits;
}
/**
 * Turns raw vector hits into full result rows.
 *
 * Hits scoring below the threshold (the min_score filter, or MIN_SCORE when it
 * is absent), hits without a matching chunk row, and chunks rejected by the
 * filters are dropped. Every surviving hit is combined with its chunk and
 * semantic columns; relevance_score starts out equal to the vector score.
 *
 * @param array<int, array<string, mixed>> $vectorResults Hits carrying qdrant_id and score.
 * @param array<string, mixed>             $filters       Filters forwarded to matchesFilters().
 *
 * @return list<array<string, mixed>>
 */
private function enrichWithSemantics(array $vectorResults, array $filters): array
{
    $threshold = $filters['min_score'] ?? self::MIN_SCORE;
    $enriched = [];

    foreach ($vectorResults as $hit) {
        $score = $hit['score'];
        if ($score < $threshold) {
            continue;
        }

        $row = $this->getChunkWithSemantics($hit['qdrant_id']);

        // Unknown chunk, or chunk rejected by the filters: nothing to emit.
        if ($row === null || !$this->matchesFilters($row, $filters)) {
            continue;
        }

        $enriched[] = [
            'chunk_id' => (int) $row['chunk_id'],
            'document_id' => (int) $row['document_id'],
            'source_path' => $row['source_path'] ?? '',
            'content' => $row['content'] ?? '',
            'heading_path' => $row['heading_path'] ?? '',
            // Semantic columns (may be null when no chunk_semantics row was joined).
            'summary' => $row['summary'] ?? null,
            'keywords' => $this->decodeJsonArray($row['keywords'] ?? null),
            'sentiment' => $row['sentiment'] ?? 'neutral',
            'intent' => $row['intent'] ?? null,
            'discourse_role' => $row['discourse_role'] ?? null,
            'statement_form' => $row['statement_form'] ?? null,
            'frame' => $row['frame'] ?? null,
            // Scores: both start from the raw vector similarity.
            'score' => $score,
            'relevance_score' => $score,
        ];
    }

    return $enriched;
}
/**
 * Looks up one chunk by its Qdrant point id, joined with its document and,
 * when present, its chunk_semantics row (LEFT JOIN, so semantic columns may be null).
 *
 * @param string $qdrantId Value matched against chunks.qdrant_id.
 *
 * @return array<string, mixed>|null The first matching row, or null when none exists.
 */
private function getChunkWithSemantics(string $qdrantId): ?array
{
    $sql = '
SELECT
c.id as chunk_id,
c.document_id,
c.content,
c.heading_path,
d.source_path,
cs.summary,
cs.keywords,
cs.sentiment,
cs.intent,
cs.discourse_role,
cs.statement_form,
cs.frame
FROM ki_content.chunks c
JOIN ki_content.documents d ON c.document_id = d.id
LEFT JOIN ki_content.chunk_semantics cs ON c.id = cs.chunk_id
WHERE c.qdrant_id = :qdrant_id
';

    $lookup = $this->pdo->prepare($sql);
    $lookup->execute(['qdrant_id' => $qdrantId]);

    $row = $lookup->fetch(PDO::FETCH_ASSOC);
    if ($row === false) {
        return null;
    }

    return $row;
}
/**
 * Decides whether a chunk row passes every filter that is set.
 *
 * intent, discourse_role, sentiment and frame are compared strictly against the
 * chunk's column of the same name; taxonomy_path is checked through the
 * chunk_taxonomy tables. Filters that are absent or null are ignored.
 *
 * @param array<string, mixed> $chunk   Row as returned by getChunkWithSemantics().
 * @param array<string, mixed> $filters Requested filters.
 */
private function matchesFilters(array $chunk, array $filters): bool
{
    // Simple equality filters: the filter key and the chunk column share a name.
    foreach (['intent', 'discourse_role', 'sentiment', 'frame'] as $field) {
        if (!isset($filters[$field])) {
            continue;
        }

        if (($chunk[$field] ?? null) !== $filters[$field]) {
            return false;
        }
    }

    if (!isset($filters['taxonomy_path'])) {
        return true;
    }

    // Taxonomy membership needs its own lookup, so it runs last.
    return $this->chunkMatchesTaxonomy((int) $chunk['chunk_id'], $filters['taxonomy_path']);
}
/**
 * Checks if chunk belongs to taxonomy path.
 *
 * A chunk matches when it is linked to at least one taxonomy term whose path
 * starts with the given taxonomy path (plain string prefix).
 *
 * @param int    $chunkId      Chunk primary key.
 * @param string $taxonomyPath Path prefix, taken literally.
 */
private function chunkMatchesTaxonomy(int $chunkId, string $taxonomyPath): bool
{
    // "%" and "_" are LIKE wildcards. Left unescaped, a path such as "data_science"
    // would also match "dataXscience...". Escape them with an explicit escape
    // character so the prefix is matched literally; "!" is used rather than "\"
    // because backslash handling inside SQL string literals differs between databases.
    $literalPrefix = strtr($taxonomyPath, [
        '!' => '!!',
        '%' => '!%',
        '_' => '!_',
    ]);

    $stmt = $this->pdo->prepare('
SELECT 1 FROM ki_content.chunk_taxonomy ct
JOIN ki_content.taxonomy_terms tt ON ct.taxonomy_term_id = tt.id
WHERE ct.chunk_id = :chunk_id AND tt.path LIKE :path ESCAPE \'!\'
LIMIT 1
');
    $stmt->execute([
        'chunk_id' => $chunkId,
        'path' => $literalPrefix . '%',
    ]);

    return $stmt->fetch() !== false;
}
/**
 * Re-ranks results based on semantic relevance.
 *
 * Starts from each result's vector score and adds heuristic boosts:
 *  - +0.05 per (keyword, query word) pair where the keyword contains the word,
 *  - +0.15 for definition chunks when the query asks for a definition,
 *    otherwise +0.08 for thesis chunks or +0.05 for evidence chunks,
 *  - +0.05 when the intent is "explain" or "define",
 *  - +0.03 per query word contained in the summary.
 * The sum is capped at 1.0 and stored as relevance_score; results are returned
 * sorted by that value, highest first.
 *
 * Matching is case-insensitive and multibyte-aware (requires ext-mbstring), so
 * umlauts are folded the same way as ASCII letters.
 *
 * @param array<int, array<string, mixed>> $results Rows as built by enrichWithSemantics().
 * @param string                           $query   Original user query.
 *
 * @return list<array<string, mixed>>
 */
private function rerank(array $results, string $query): array
{
    // PREG_SPLIT_NO_EMPTY drops empty tokens only; unlike array_filter() it keeps
    // a literal "0" as a query word. The "u" modifier makes \s UTF-8 safe.
    $queryWords = preg_split('/\s+/u', mb_strtolower($query, 'UTF-8'), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $isDefinitionQuery = $this->isDefinitionQuery($query);

    // Write back by index instead of iterating by reference, so no dangling
    // reference into $results survives the loop.
    foreach ($results as $index => $result) {
        $boost = 0.0;

        // Keyword matching.
        $keywords = is_array($result['keywords'] ?? null) ? $result['keywords'] : [];
        foreach ($keywords as $keyword) {
            if (!is_string($keyword)) {
                // Decoded JSON may contain non-string entries; they cannot match text.
                continue;
            }

            foreach ($queryWords as $word) {
                if (mb_stripos($keyword, $word, 0, 'UTF-8') !== false) {
                    $boost += 0.05;
                }
            }
        }

        // Semantic boost: discourse role (first matching rule wins).
        $discourseRole = $result['discourse_role'] ?? null;
        $boost += match (true) {
            $isDefinitionQuery && $discourseRole === 'definition' => 0.15,
            $discourseRole === 'thesis' => 0.08,
            $discourseRole === 'evidence' => 0.05,
            default => 0.0,
        };

        // Semantic boost: intent.
        if (in_array($result['intent'] ?? null, ['explain', 'define'], true)) {
            $boost += 0.05;
        }

        // Summary matching.
        $summary = $result['summary'] ?? null;
        if (is_string($summary)) {
            foreach ($queryWords as $word) {
                if (mb_stripos($summary, $word, 0, 'UTF-8') !== false) {
                    $boost += 0.03;
                }
            }
        }

        $results[$index]['relevance_score'] = min(1.0, $result['score'] + $boost);
    }

    usort($results, static fn (array $a, array $b): int => $b['relevance_score'] <=> $a['relevance_score']);

    return $results;
}
/**
 * Detects if query is asking for a definition.
 *
 * Recognises German queries that start (after optional leading whitespace) with
 * "was ist", "was sind", "was bedeutet", "definition" or "erklär"/"erklar",
 * case-insensitively.
 */
private function isDefinitionQuery(string $query): bool
{
    // The "u" modifier is required: without it "[äa]" is a byte class, the
    // two-byte "ä" can never be consumed as one character, and "erklär…" would
    // not match. "^\s*" tolerates untrimmed user input.
    $patterns = [
        '/^\s*was\s+ist\b/iu',
        '/^\s*was\s+sind\b/iu',
        '/^\s*was\s+bedeutet\b/iu',
        '/^\s*definition\b/iu',
        '/^\s*erkl[äa]r/iu',
    ];

    foreach ($patterns as $pattern) {
        // preg_match() yields false for invalid UTF-8 input; treat that as "no match".
        if (preg_match($pattern, $query) === 1) {
            return true;
        }
    }

    return false;
}
/**
 * Makes HTTP request to Qdrant.
 *
 * Sends the payload as a JSON POST body and returns the decoded JSON response.
 * A response body that is not a JSON array/object decodes to an empty array.
 *
 * @param string               $url     Full endpoint URL.
 * @param array<string, mixed> $payload Request body, JSON-encoded before sending.
 *
 * @return array<string, mixed> Decoded response, or [] when the body is not valid JSON.
 *
 * @throws RuntimeException When the payload cannot be encoded, cURL cannot be
 *                          initialised, the transfer fails, or the server answers
 *                          with an HTTP status of 400 or above.
 */
private function makeRequest(string $url, array $payload): array
{
    // Encode first so an unencodable payload never opens a handle.
    $jsonPayload = json_encode($payload);
    if ($jsonPayload === false) {
        throw new RuntimeException('Failed to encode JSON');
    }

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException('Failed to initialize cURL');
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT => self::TIMEOUT,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $jsonPayload,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            'Content-Length: ' . strlen($jsonPayload),
        ],
    ]);

    $result = curl_exec($ch);

    // Transport-level failure (DNS, refused connection, timeout, ...): there is no
    // HTTP status to report, so surface the cURL error instead of "HTTP 0".
    if ($result === false) {
        throw new RuntimeException(
            sprintf('Qdrant request failed: %s (cURL error %d)', curl_error($ch), curl_errno($ch)),
        );
    }

    $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    if ($httpCode >= 400) {
        throw new RuntimeException(sprintf('Qdrant request failed: HTTP %d', $httpCode));
    }

    // No curl_close(): it has been a no-op since PHP 8.0; the handle is released
    // when $ch goes out of scope.
    $decoded = json_decode((string) $result, true);

    return is_array($decoded) ? $decoded : [];
}
}