step_extract.py

Code Hygiene Score: 100

Keine Issues gefunden.

Dependencies 5

Klassen 1

Code

"""
Extraction Step Module
Handles document text extraction for all supported formats.

Part of modularized pipeline architecture.
"""

import hashlib
import os
from pathlib import Path

from extract import extract, get_full_text


class ExtractionStep:
    """Pipeline step that extracts text from a document and gathers file metadata.

    The step delegates the actual extraction to the module-level ``extract``
    function, reports to an optional progress object and writes log entries
    through the database object it was constructed with.
    """

    # Read size used while hashing, so that a large document is never held
    # in memory in one piece just to compute its checksum.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, db, progress=None):
        """
        Initialize extraction step.

        Args:
            db: Database instance; only its ``log(level, message)`` method
                is used by this step.
            progress: Optional PipelineProgress instance. When given, its
                ``update_step``, ``add_log`` and ``is_cancelled`` methods
                are called during ``execute``.
        """
        self.db = db
        self.progress = progress

    @staticmethod
    def _hash_file(file_path, chunk_size=_HASH_CHUNK_SIZE):
        """Return the hex SHA-256 digest of a file, read in fixed-size chunks."""
        digest = hashlib.sha256()
        with open(file_path, "rb") as f:
            # iter(callable, sentinel) stops at the first empty read (EOF).
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def execute(self, file_path):
        """
        Extract text from document.

        Args:
            file_path: Path to document file

        Returns:
            dict: On success::

                {
                    'success': True,
                    'extraction': result dict returned by ``extract``,
                    'file_info': dict with path/name/type/size/hash,
                    'total_pages': int (page count for PDFs, otherwise 0),
                }

            On failure or cancellation::

                {
                    'success': False,
                    'error': str ("cancelled" when the run was cancelled),
                }

        Raises:
            OSError: If the file cannot be stat'ed or opened for hashing
                after a successful extraction.
        """
        file_name = Path(file_path).name

        if self.progress:
            self.progress.update_step("extract")
            self.progress.add_log(f"Extrahiere Text: {file_name}")

        self.db.log("INFO", f"Extracting: {file_path}")

        # Check cancellation before doing any expensive work.
        if self.progress and self.progress.is_cancelled():
            return {"success": False, "error": "cancelled"}

        # Extract content. A result without a "success" key is treated as a
        # failure instead of raising KeyError, matching the .get() lookups
        # used for the other optional keys.
        extraction = extract(file_path)
        if not extraction.get("success"):
            error = extraction.get("error", "Unknown extraction error")
            self.db.log("ERROR", f"Extraction failed: {error}")
            if self.progress:
                self.progress.add_log("FEHLER: Extraktion fehlgeschlagen")
            return {"success": False, "error": error}

        # Get file metadata; the hash is streamed rather than computed from
        # a single read of the whole file.
        file_stat = os.stat(file_path)
        file_hash = self._hash_file(file_path, self._HASH_CHUNK_SIZE)

        file_info = {
            "path": file_path,
            "name": file_name,
            "type": extraction["file_type"],
            "size": file_stat.st_size,
            "hash": file_hash,
        }

        # Count pages for PDFs: only when the content is a list, in which
        # case its length is reported as the page count.
        total_pages = 0
        if extraction["file_type"] == ".pdf" and isinstance(extraction.get("content"), list):
            total_pages = len(extraction["content"])
            if self.progress:
                self.progress.add_log(f"{total_pages} Seiten extrahiert")

        self.db.log("INFO", f"Extraction complete: {total_pages} pages" if total_pages else "Extraction complete")

        return {
            "success": True,
            "extraction": extraction,
            "file_info": file_info,
            "total_pages": total_pages,
        }

    def get_full_text_from_extraction(self, extraction):
        """
        Get full text from extraction result.

        Thin wrapper around the module-level ``get_full_text`` helper.

        Args:
            extraction: Extraction result dict

        Returns:
            str: Full document text
        """
        return get_full_text(extraction)
← Übersicht Graph