Pārlūkot izejas kodu

ocr_construct_page_component 位置移动

赵小蒙 1 gadu atpakaļ
vecāks
revīzija
fc10772503

+ 1 - 1
magic_pdf/pdf_parse_by_model.py

@@ -53,7 +53,7 @@ from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
 from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
 from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
 from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
-from magic_pdf.pre_proc.construct_paras import construct_page_component
+from magic_pdf.pre_proc.construct_page_dict import construct_page_component
 from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
 from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
 from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block

+ 2 - 23
magic_pdf/pdf_parse_by_ocr.py

@@ -18,6 +18,7 @@ from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.para.para_split import para_split
+from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
 from magic_pdf.pre_proc.detect_header import parse_headers
@@ -33,28 +34,6 @@ from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remo
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 
 
-def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                             images, tables, interline_equations, inline_equations,
-                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
-                             need_remove_spans_bboxes_dict):
-    return_dict = {
-        'preproc_blocks': blocks,
-        'layout_bboxes': layout_bboxes,
-        'page_idx': page_id,
-        'page_size': [page_w, page_h],
-        '_layout_tree': layout_tree,
-        'images': images,
-        'tables': tables,
-        'interline_equations': interline_equations,
-        'inline_equations': inline_equations,
-        'droped_text_block': dropped_text_block,
-        'droped_image_block': dropped_image_block,
-        'droped_table_block': dropped_table_block,
-        'dropped_equation_block': dropped_equation_block,
-        'droped_bboxes': need_remove_spans_bboxes_dict,
-    }
-    return return_dict
-
 
 def parse_pdf_by_ocr(
         pdf_path,
@@ -254,7 +233,7 @@ def parse_pdf_by_ocr(
                 dropped_equation_block.append(span)
 
         '''构造pdf_info_dict'''
-        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+        page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                              images, tables, interline_equations, inline_equations,
                                              dropped_text_block, dropped_image_block, dropped_table_block,
                                              dropped_equation_block,

+ 1 - 1
magic_pdf/pdf_parse_for_train.py

@@ -75,7 +75,7 @@ from magic_pdf.pre_proc.equations_replace import (
 )
 from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
 from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
-from magic_pdf.pre_proc.construct_paras import construct_page_component
+from magic_pdf.pre_proc.construct_page_dict import construct_page_component
 from magic_pdf.pre_proc.fix_image import (
     combine_images,
     fix_image_vertical,

+ 23 - 0
magic_pdf/pre_proc/construct_paras.py → magic_pdf/pre_proc/construct_page_dict.py

@@ -28,3 +28,26 @@ def construct_page_component(page_id, image_info, table_info,  text_blocks_prepr
     return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
     
     return return_dict
+
+
+def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                             images, tables, interline_equations, inline_equations,
+                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
+                             need_remove_spans_bboxes_dict):
+    return_dict = {
+        'preproc_blocks': blocks,
+        'layout_bboxes': layout_bboxes,
+        'page_idx': page_id,
+        'page_size': [page_w, page_h],
+        '_layout_tree': layout_tree,
+        'images': images,
+        'tables': tables,
+        'interline_equations': interline_equations,
+        'inline_equations': inline_equations,
+        'droped_text_block': dropped_text_block,
+        'droped_image_block': dropped_image_block,
+        'droped_table_block': dropped_table_block,
+        'dropped_equation_block': dropped_equation_block,
+        'droped_bboxes': need_remove_spans_bboxes_dict,
+    }
+    return return_dict