step_extract.py
- Pfad: /var/www/scripts/pipeline/step_extract.py
- Namespace: pipeline
- Zeilen: 105 | Größe: 3,006 Bytes
- Geändert: 2025-12-25 16:55:42 | Gescannt: 2025-12-31 10:22:15
Code Hygiene Score: 100
- Dependencies: 100 (25%)
- LOC: 100 (20%)
- Methods: 100 (20%)
- Secrets: 100 (15%)
- Classes: 100 (10%)
- Magic Numbers: 100 (10%)
Keine Issues gefunden.
Dependencies 5
- use hashlib
- use os
- use pathlib.Path
- use extract.extract
- use extract.get_full_text
Klassen 1
- ExtractionStep (class), Zeile 15
Code
"""
Extraction Step Module
Handles document text extraction for all supported formats.
Part of modularized pipeline architecture.
"""
import hashlib
import os
from pathlib import Path
from extract import extract, get_full_text
class ExtractionStep:
    """Step: Extract text from documents.

    Delegates the actual extraction to ``extract()`` and adds progress
    reporting, database logging, a cancellation check and file metadata
    (size and SHA-256 hash) around it.
    """

    # Read size used while hashing so a large document is never held in
    # memory as a whole.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, db, progress=None):
        """
        Initialize extraction step.

        Args:
            db: Database instance; this class only calls ``db.log(level, message)``.
            progress: Optional PipelineProgress instance. When given, its
                ``update_step``, ``add_log`` and ``is_cancelled`` methods are used.
        """
        self.db = db
        self.progress = progress

    def execute(self, file_path):
        """
        Extract text from document.

        Args:
            file_path: Path to document file

        Returns:
            dict: on success {
                'success': True,
                'extraction': the dict returned by ``extract()``,
                'file_info': dict with path/name/type/size/hash,
                'total_pages': int, number of content entries for a PDF
                    whose content is a list, otherwise 0
            }
            on failure or cancellation {
                'success': False,
                'error': error description ("cancelled" when cancelled)
            }
        """
        file_name = Path(file_path).name

        if self.progress:
            self.progress.update_step("extract")
            self.progress.add_log(f"Extrahiere Text: {file_name}")

        self.db.log("INFO", f"Extracting: {file_path}")

        # Check cancellation before doing any expensive work.
        if self.progress and self.progress.is_cancelled():
            return {"success": False, "error": "cancelled"}

        # Extract content. ``.get`` keeps a result without a "success" key
        # on the regular failure path instead of raising KeyError.
        extraction = extract(file_path)
        if not extraction.get("success"):
            return self._fail(extraction.get("error", "Unknown extraction error"))

        # Get file metadata. The file may have become unreadable since the
        # extraction ran; report that as a failed step rather than letting
        # OSError escape the documented result-dict contract.
        try:
            file_size = os.stat(file_path).st_size
            file_hash = self._hash_file(file_path)
        except OSError as exc:
            return self._fail(f"Cannot read file metadata: {exc}")

        file_type = extraction["file_type"]
        file_info = {
            "path": file_path,
            "name": file_name,
            "type": file_type,
            "size": file_size,
            "hash": file_hash,
        }

        # Count pages for PDFs (only when the content is a list).
        total_pages = 0
        if file_type == ".pdf" and isinstance(extraction.get("content"), list):
            total_pages = len(extraction["content"])
            if self.progress:
                self.progress.add_log(f"{total_pages} Seiten extrahiert")

        self.db.log(
            "INFO",
            f"Extraction complete: {total_pages} pages" if total_pages else "Extraction complete",
        )

        return {
            "success": True,
            "extraction": extraction,
            "file_info": file_info,
            "total_pages": total_pages,
        }

    def get_full_text_from_extraction(self, extraction):
        """
        Get full text from extraction result.

        Args:
            extraction: Extraction result dict

        Returns:
            str: Full document text
        """
        return get_full_text(extraction)

    def _fail(self, error):
        """Log a failed extraction and return the failure result dict."""
        self.db.log("ERROR", f"Extraction failed: {error}")
        if self.progress:
            self.progress.add_log("FEHLER: Extraktion fehlgeschlagen")
        return {"success": False, "error": error}

    @staticmethod
    def _hash_file(file_path, chunk_size=_HASH_CHUNK_SIZE):
        """Return the SHA-256 hex digest of a file, read in fixed-size chunks."""
        digest = hashlib.sha256()
        with open(file_path, "rb") as handle:
            # iter() with a sentinel stops at the empty bytes object that
            # read() returns at end of file.
            for chunk in iter(lambda: handle.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()