Forráskód Böngészése

refactor(magic_pdf): remove unused functions and simplify code

myhloli 11 hónapja
szülő
commit
ecdaa49aee
52 módosított fájl, 3 hozzáadás és 64 törlés
  1. 0 0
      magic_pdf/dict2md/mkcontent.py.bak
  2. 0 0
      magic_pdf/layout.bak/__init__.py
  3. 0 0
      magic_pdf/layout.bak/bbox_sort.py
  4. 0 0
      magic_pdf/layout.bak/layout_det_utils.py
  5. 0 0
      magic_pdf/layout.bak/layout_sort.py
  6. 0 0
      magic_pdf/layout.bak/layout_spiler_recog.py
  7. 0 0
      magic_pdf/layout.bak/mcol_sort.py
  8. 0 0
      magic_pdf/libs/calc_span_stats.py.bak
  9. 0 0
      magic_pdf/libs/detect_language_from_model.py.bak
  10. 1 3
      magic_pdf/libs/markdown_utils.py
  11. 0 0
      magic_pdf/libs/nlp_utils.py.bak
  12. 0 0
      magic_pdf/libs/textbase.py.bak
  13. 0 0
      magic_pdf/libs/vis_utils.py.bak
  14. 0 0
      magic_pdf/para/block_continuation_processor.py.bak
  15. 0 0
      magic_pdf/para/block_termination_processor.py.bak
  16. 0 0
      magic_pdf/para/commons.py.bak
  17. 0 0
      magic_pdf/para/denoise.py.bak
  18. 0 0
      magic_pdf/para/draw.py.bak
  19. 0 0
      magic_pdf/para/exceptions.py.bak
  20. 0 0
      magic_pdf/para/layout_match_processor.py.bak
  21. 0 0
      magic_pdf/para/para_split.py.bak
  22. 0 0
      magic_pdf/para/para_split_v2.py.bak
  23. 0 0
      magic_pdf/para/raw_processor.py.bak
  24. 0 0
      magic_pdf/para/stats.py.bak
  25. 0 0
      magic_pdf/para/title_processor.py.bak
  26. 2 61
      magic_pdf/pdf_parse_union_core_v2.py
  27. 0 0
      magic_pdf/post_proc.bak/__init__.py
  28. 0 0
      magic_pdf/post_proc.bak/detect_para.py.bak
  29. 0 0
      magic_pdf/post_proc.bak/pdf_post_filter.py.bak
  30. 0 0
      magic_pdf/post_proc.bak/remove_footnote.py.bak
  31. 0 0
      magic_pdf/pre_proc/citationmarker_remove.py.bak
  32. 0 0
      magic_pdf/pre_proc/detect_equation.py.bak
  33. 0 0
      magic_pdf/pre_proc/detect_footer_by_model.py.bak
  34. 0 0
      magic_pdf/pre_proc/detect_footer_header_by_statistics.py.bak
  35. 0 0
      magic_pdf/pre_proc/detect_footnote.py.bak
  36. 0 0
      magic_pdf/pre_proc/detect_header.py.bak
  37. 0 0
      magic_pdf/pre_proc/detect_images.py.bak
  38. 0 0
      magic_pdf/pre_proc/detect_page_number.py.bak
  39. 0 0
      magic_pdf/pre_proc/detect_tables.py.bak
  40. 0 0
      magic_pdf/pre_proc/equations_replace.py.bak
  41. 0 0
      magic_pdf/pre_proc/fix_image.py.bak
  42. 0 0
      magic_pdf/pre_proc/fix_table.py.bak
  43. 0 0
      magic_pdf/pre_proc/main_text_font.py.bak
  44. 0 0
      magic_pdf/pre_proc/ocr_detect_layout.py.bak
  45. 0 0
      magic_pdf/pre_proc/pdf_pre_filter.py.bak
  46. 0 0
      magic_pdf/pre_proc/post_layout_split.py.bak
  47. 0 0
      magic_pdf/pre_proc/remove_colored_strip_bbox.py.bak
  48. 0 0
      magic_pdf/pre_proc/remove_footer_header.py.bak
  49. 0 0
      magic_pdf/pre_proc/remove_rotate_bbox.py.bak
  50. 0 0
      magic_pdf/pre_proc/resolve_bbox_conflict.py.bak
  51. 0 0
      magic_pdf/pre_proc/solve_line_alien.py.bak
  52. 0 0
      magic_pdf/pre_proc/statistics.py.bak

+ 0 - 0
magic_pdf/dict2md/mkcontent.py → magic_pdf/dict2md/mkcontent.py.bak


+ 0 - 0
magic_pdf/layout/__init__.py → magic_pdf/layout.bak/__init__.py


+ 0 - 0
magic_pdf/layout/bbox_sort.py → magic_pdf/layout.bak/bbox_sort.py


+ 0 - 0
magic_pdf/layout/layout_det_utils.py → magic_pdf/layout.bak/layout_det_utils.py


+ 0 - 0
magic_pdf/layout/layout_sort.py → magic_pdf/layout.bak/layout_sort.py


+ 0 - 0
magic_pdf/layout/layout_spiler_recog.py → magic_pdf/layout.bak/layout_spiler_recog.py


+ 0 - 0
magic_pdf/layout/mcol_sort.py → magic_pdf/layout.bak/mcol_sort.py


+ 0 - 0
magic_pdf/libs/calc_span_stats.py → magic_pdf/libs/calc_span_stats.py.bak


+ 0 - 0
magic_pdf/libs/detect_language_from_model.py → magic_pdf/libs/detect_language_from_model.py.bak


+ 1 - 3
magic_pdf/libs/markdown_utils.py

@@ -1,6 +1,4 @@
-import re
-
-
+@DeprecationWarning
 def escape_special_markdown_char(pymu_blocks):
     """
     转义正文里对markdown语法有特殊意义的字符

+ 0 - 0
magic_pdf/libs/nlp_utils.py → magic_pdf/libs/nlp_utils.py.bak


+ 0 - 0
magic_pdf/libs/textbase.py → magic_pdf/libs/textbase.py.bak


+ 0 - 0
magic_pdf/libs/vis_utils.py → magic_pdf/libs/vis_utils.py.bak


+ 0 - 0
magic_pdf/para/block_continuation_processor.py → magic_pdf/para/block_continuation_processor.py.bak


+ 0 - 0
magic_pdf/para/block_termination_processor.py → magic_pdf/para/block_termination_processor.py.bak


+ 0 - 0
magic_pdf/para/commons.py → magic_pdf/para/commons.py.bak


+ 0 - 0
magic_pdf/para/denoise.py → magic_pdf/para/denoise.py.bak


+ 0 - 0
magic_pdf/para/draw.py → magic_pdf/para/draw.py.bak


+ 0 - 0
magic_pdf/para/exceptions.py → magic_pdf/para/exceptions.py.bak


+ 0 - 0
magic_pdf/para/layout_match_processor.py → magic_pdf/para/layout_match_processor.py.bak


+ 0 - 0
magic_pdf/para/para_split.py → magic_pdf/para/para_split.py.bak


+ 0 - 0
magic_pdf/para/para_split_v2.py → magic_pdf/para/para_split_v2.py.bak


+ 0 - 0
magic_pdf/para/raw_processor.py → magic_pdf/para/raw_processor.py.bak


+ 0 - 0
magic_pdf/para/stats.py → magic_pdf/para/stats.py.bak


+ 0 - 0
magic_pdf/para/title_processor.py → magic_pdf/para/title_processor.py.bak


+ 2 - 61
magic_pdf/pdf_parse_union_core_v2.py

@@ -34,13 +34,11 @@ except ImportError:
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 
 from magic_pdf.para.para_split_v3 import para_split
-from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
+
 from magic_pdf.pre_proc.construct_page_dict import \
     ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
-from magic_pdf.pre_proc.equations_replace import (
-    combine_chars_to_pymudict, remove_chars_in_text_blocks,
-    replace_equations_in_textblock)
+
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
     ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
@@ -49,26 +47,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
 from magic_pdf.pre_proc.ocr_span_list_modify import (
     get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
     remove_overlaps_min_spans)
-from magic_pdf.pre_proc.resolve_bbox_conflict import \
-    check_useful_block_horizontal_overlap
-
-
-def remove_horizontal_overlap_block_which_smaller(all_bboxes):
-    useful_blocks = []
-    for bbox in all_bboxes:
-        useful_blocks.append({'bbox': bbox[:4]})
-    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
-        check_useful_block_horizontal_overlap(useful_blocks)
-    )
-    if is_useful_block_horz_overlap:
-        logger.warning(
-            f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
-        )  # noqa: E501
-        for bbox in all_bboxes.copy():
-            if smaller_bbox == bbox[:4]:
-                all_bboxes.remove(bbox)
-
-    return is_useful_block_horz_overlap, all_bboxes
 
 
 def __replace_STX_ETX(text_str: str):
@@ -264,39 +242,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     return spans
 
 
-def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
-    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
-    char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
-        'blocks'
-    ]
-    text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
-    text_blocks = replace_equations_in_textblock(
-        text_blocks, inline_equations, interline_equations
-    )
-    text_blocks = remove_citation_marker(text_blocks)
-    text_blocks = remove_chars_in_text_blocks(text_blocks)
-    spans = []
-    for v in text_blocks:
-        for line in v['lines']:
-            for span in line['spans']:
-                bbox = span['bbox']
-                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
-                    continue
-                if span.get('type') not in (
-                    ContentType.InlineEquation,
-                    ContentType.InterlineEquation,
-                ):
-                    spans.append(
-                        {
-                            'bbox': list(span['bbox']),
-                            'content': __replace_STX_ETX(span['text']),
-                            'type': ContentType.Text,
-                            'score': 1.0,
-                        }
-                    )
-    return spans
-
-
 def replace_text_span(pymu_spans, ocr_spans):
     return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
 
@@ -722,10 +667,6 @@ def parse_page_core(
     """根据parse_mode,构造spans,主要是文本类的字符填充"""
     if parse_mode == SupportedPdfParseMethod.TXT:
 
-        """之前的公式替换方案"""
-        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
-        # spans = replace_text_span(pymu_spans, spans)
-
         """使用新版本的混合ocr方案"""
         spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
 

+ 0 - 0
magic_pdf/post_proc/__init__.py → magic_pdf/post_proc.bak/__init__.py


+ 0 - 0
magic_pdf/post_proc/detect_para.py → magic_pdf/post_proc.bak/detect_para.py.bak


+ 0 - 0
magic_pdf/post_proc/pdf_post_filter.py → magic_pdf/post_proc.bak/pdf_post_filter.py.bak


+ 0 - 0
magic_pdf/post_proc/remove_footnote.py → magic_pdf/post_proc.bak/remove_footnote.py.bak


+ 0 - 0
magic_pdf/pre_proc/citationmarker_remove.py → magic_pdf/pre_proc/citationmarker_remove.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_equation.py → magic_pdf/pre_proc/detect_equation.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_footer_by_model.py → magic_pdf/pre_proc/detect_footer_by_model.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_footer_header_by_statistics.py → magic_pdf/pre_proc/detect_footer_header_by_statistics.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_footnote.py → magic_pdf/pre_proc/detect_footnote.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_header.py → magic_pdf/pre_proc/detect_header.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_images.py → magic_pdf/pre_proc/detect_images.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_page_number.py → magic_pdf/pre_proc/detect_page_number.py.bak


+ 0 - 0
magic_pdf/pre_proc/detect_tables.py → magic_pdf/pre_proc/detect_tables.py.bak


+ 0 - 0
magic_pdf/pre_proc/equations_replace.py → magic_pdf/pre_proc/equations_replace.py.bak


+ 0 - 0
magic_pdf/pre_proc/fix_image.py → magic_pdf/pre_proc/fix_image.py.bak


+ 0 - 0
magic_pdf/pre_proc/fix_table.py → magic_pdf/pre_proc/fix_table.py.bak


+ 0 - 0
magic_pdf/pre_proc/main_text_font.py → magic_pdf/pre_proc/main_text_font.py.bak


+ 0 - 0
magic_pdf/pre_proc/ocr_detect_layout.py → magic_pdf/pre_proc/ocr_detect_layout.py.bak


+ 0 - 0
magic_pdf/pre_proc/pdf_pre_filter.py → magic_pdf/pre_proc/pdf_pre_filter.py.bak


+ 0 - 0
magic_pdf/pre_proc/post_layout_split.py → magic_pdf/pre_proc/post_layout_split.py.bak


+ 0 - 0
magic_pdf/pre_proc/remove_colored_strip_bbox.py → magic_pdf/pre_proc/remove_colored_strip_bbox.py.bak


+ 0 - 0
magic_pdf/pre_proc/remove_footer_header.py → magic_pdf/pre_proc/remove_footer_header.py.bak


+ 0 - 0
magic_pdf/pre_proc/remove_rotate_bbox.py → magic_pdf/pre_proc/remove_rotate_bbox.py.bak


+ 0 - 0
magic_pdf/pre_proc/resolve_bbox_conflict.py → magic_pdf/pre_proc/resolve_bbox_conflict.py.bak


+ 0 - 0
magic_pdf/pre_proc/solve_line_alien.py → magic_pdf/pre_proc/solve_line_alien.py.bak


+ 0 - 0
magic_pdf/pre_proc/statistics.py → magic_pdf/pre_proc/statistics.py.bak