|
|
@@ -185,7 +185,8 @@ class EnhancedDocPipeline:
|
|
|
def process_document(
|
|
|
self,
|
|
|
document_path: str,
|
|
|
- page_range: Optional[str] = None
|
|
|
+ page_range: Optional[str] = None,
|
|
|
+ output_dir: Optional[str] = None
|
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
|
处理文档主流程
|
|
|
@@ -233,7 +234,8 @@ class EnhancedDocPipeline:
|
|
|
page_idx=page_idx,
|
|
|
pdf_type=pdf_type,
|
|
|
pdf_doc=pdf_doc,
|
|
|
- page_name=page_name
|
|
|
+ page_name=page_name,
|
|
|
+ output_dir=output_dir,
|
|
|
)
|
|
|
results['pages'].append(page_result)
|
|
|
|
|
|
@@ -252,13 +254,14 @@ class EnhancedDocPipeline:
|
|
|
raise
|
|
|
|
|
|
def _process_single_page(
|
|
|
- self,
|
|
|
- image_dict: Dict[str, Any],
|
|
|
- page_idx: int,
|
|
|
- pdf_type: str,
|
|
|
- pdf_doc: Optional[Any] = None,
|
|
|
- page_name: Optional[str] = None
|
|
|
- ) -> Dict[str, Any]:
|
|
|
+ self,
|
|
|
+ image_dict: Dict[str, Any],
|
|
|
+ page_idx: int,
|
|
|
+ pdf_type: str,
|
|
|
+ pdf_doc: Optional[Any] = None,
|
|
|
+ page_name: Optional[str] = None,
|
|
|
+ output_dir: Optional[str] = None
|
|
|
+ ) -> Dict[str, Any]:
|
|
|
"""
|
|
|
处理单页文档
|
|
|
|
|
|
@@ -351,9 +354,6 @@ class EnhancedDocPipeline:
|
|
|
logger.info(f"📝 Page {page_idx}: OCR detected {len(all_ocr_spans)} text spans")
|
|
|
except Exception as e:
|
|
|
logger.warning(f"⚠️ Full-page OCR failed: {e}")
|
|
|
-
|
|
|
- skew_angle = BBoxExtractor.calculate_skew_angle(all_ocr_spans)
|
|
|
- logger.info(f"📊 Wired table skew angle: {skew_angle:.3f}°")
|
|
|
|
|
|
# 4. 将 OCR spans 匹配到 layout blocks
|
|
|
matched_spans = SpanMatcher.match_spans_to_blocks(
|
|
|
@@ -372,7 +372,9 @@ class EnhancedDocPipeline:
|
|
|
page_idx=page_idx,
|
|
|
scale=scale,
|
|
|
matched_spans=matched_spans,
|
|
|
- layout_results=layout_results
|
|
|
+ layout_results=layout_results,
|
|
|
+ output_dir=output_dir,
|
|
|
+ basename=page_name
|
|
|
)
|
|
|
|
|
|
# 7. 按阅读顺序排序
|
|
|
@@ -512,7 +514,9 @@ class EnhancedDocPipeline:
|
|
|
page_idx: int,
|
|
|
scale: float,
|
|
|
matched_spans: Optional[Dict[int, List[Dict[str, Any]]]] = None,
|
|
|
- layout_results: Optional[List[Dict[str, Any]]] = None
|
|
|
+ layout_results: Optional[List[Dict[str, Any]]] = None,
|
|
|
+ output_dir: Optional[str] = None,
|
|
|
+ basename: Optional[str] = None,
|
|
|
) -> tuple:
|
|
|
"""
|
|
|
处理所有分类后的元素
|
|
|
@@ -600,7 +604,8 @@ class EnhancedDocPipeline:
|
|
|
# 有线表格路径:UNet 识别
|
|
|
logger.info(f"🔷 Using wired UNet table recognition (configured)")
|
|
|
element = self.element_processors.process_table_element_wired(
|
|
|
- detection_image, item, scale, pre_matched_spans=spans
|
|
|
+ detection_image, item, scale, pre_matched_spans=spans,
|
|
|
+ output_dir=output_dir, basename=basename
|
|
|
)
|
|
|
|
|
|
# 如果有线识别失败(返回空 HTML),fallback 到 VLM
|