Browse Source

重构目录结构

赵小蒙 1 năm trước cách đây
mục cha
commit
11e4e8cc59

+ 0 - 0
download.py → demo/download.py


+ 0 - 0
draw_bbox.py → demo/draw_bbox.py


+ 4 - 5
pdf2md.py → demo/pdf2md.py

@@ -3,12 +3,11 @@ import sys
 from pathlib import Path
 
 import click
-import json
 from loguru import logger
 
-from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
-from mkcontent import mk_mm_markdown, mk_nlp_markdown
-from pdf_parse_by_model import parse_pdf_by_model
+from libs.commons import join_path
+from dict2md.mkcontent import mk_mm_markdown
+from pipeline.pdf_parse_by_model import parse_pdf_by_model
 
 
 
@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
     pth = Path(s3_pdf_path)
     book_name = pth.name
     # book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
-    save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "..","tmp", "unittest") 
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
     save_path = join_path(save_tmp_path, "md")
     text_content_save_path = f"{save_path}/{book_name}/book.md"
     # metadata_save_path = f"{save_path}/{book_name}/metadata.json"

+ 1 - 9
s3pdf2md.py → demo/s3pdf2md.py

@@ -1,17 +1,9 @@
-import os
-import sys
 from pathlib import Path
 
 import click
 import json
-from loguru import logger
-
-from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
-from mkcontent import mk_nlp_markdown
-from pdf2md import main
-from pdf_parse_by_model import parse_pdf_by_model
-
 
+from demo.pdf2md import main
 
 
 @click.command()

+ 0 - 0
__init__.py → dict2md/__init__.py


+ 0 - 0
mkcontent.py → dict2md/mkcontent.py


+ 3 - 9
pdf2json_infer.py

@@ -1,7 +1,6 @@
 import sys
 from typing import Tuple
 import os
-import click
 import boto3, json
 from botocore.config import Config
 from libs.commons import fitz
@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
 # from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
 
 from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
-from pdf2text_recogFigure import parse_images          # 获取figures的bbox
+from pre_proc.detect_images import parse_images          # 获取figures的bbox
 from pdf2text_recogTable import parse_tables           # 获取tables的bbox
-from pdf2text_recogEquation import parse_equations     # 获取equations的bbox
-from pdf2text_recogTitle import parse_titles           # 获取titles的bbox
-from pdf2text_recogHeader import parse_headers         # 获取headers的bbox
-from pdf2text_recogPageNo import parse_pageNos         # 获取pageNos的bbox
+from pre_proc.detect_equation import parse_equations     # 获取equations的bbox
 # from pdf2text_recogFootnote import parse_footnotes     # 获取footnotes的bbox
-from pdf2text_recogFooter import parse_footers         # 获取footers的bbox
-from pdf2text_evaluatePdfLayout import evaluate_pdf_layout # 评估页面的Layout是否是规整的。
-from pdf2text_recogPara import process_blocks_per_page, postprocess_paras_pipeline
+from pdf2text_recogPara import process_blocks_per_page
 from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
 
 

+ 11 - 11
pdf_parse_by_model.py → pipeline/pdf_parse_by_model.py

@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason
 from libs.markdown_utils import escape_special_markdown_char
 from libs.safe_filename import sanitize_filename
 from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
-from pdf2text_recogFigure import parse_images
-from pdf2text_recogFootnoteLine import remove_headder_footer_one_page  # 获取figures的bbox
+from pre_proc.detect_images import parse_images
 from pdf2text_recogTable import parse_tables  # 获取tables的bbox
-from pdf2text_recogEquation import parse_equations  # 获取equations的bbox
+from pre_proc.detect_equation import parse_equations  # 获取equations的bbox
 from pdf2text_recogHeader import parse_headers  # 获取headers的bbox
 from pdf2text_recogPageNo import parse_pageNos  # 获取pageNos的bbox
-from pdf2text_recogFootnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
-from pdf2text_recogFooter import parse_footers  # 获取footers的bbox
+from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
+from pre_proc.detect_footer_by_model import parse_footers  # 获取footers的bbox
 
 from pdf2text_recogPara import (
     ParaProcessPipeline,
@@ -34,6 +33,7 @@ from pdf2text_recogPara import (
 )
 from pre_proc.main_text_font import get_main_text_font
 from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
+from pre_proc.remove_footer_header import remove_headder_footer_one_page
 
 '''
 from para.para_pipeline import ParaProcessPipeline
@@ -48,17 +48,17 @@ from para.exceptions import (
 
 from libs.commons import read_file, join_path
 from libs.pdf_image_tools import save_images_by_bboxes
-from post_proc.footnote_remove import merge_footnote_blocks, remove_footnote_blocks
+from post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
 from pre_proc.citationmarker_remove import remove_citation_marker
 from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
-from pre_proc.pdf_filter import pdf_filter
-from pre_proc.detect_footer_header import drop_footer_header
+from pre_proc.pdf_pre_filter import pdf_filter
+from pre_proc.detect_footer_header_by_statistics import drop_footer_header
 from pre_proc.construct_paras import construct_page_component
-from pre_proc.image_fix import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
+from pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
 from post_proc.pdf_post_filter import pdf_post_filter
 from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
 from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
-from pre_proc.table_fix import fix_table_text_block, fix_tables, include_table_title
+from pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
 
 denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
 titleDetectionException_msg = TitleDetectionException().message
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
     debug_mode=False,
 ):
     pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
-    save_tmp_path = os.path.join(os.path.dirname(__file__), "..", "tmp", "unittest")
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
     md_bookname_save_path = ""
     book_name = sanitize_filename(book_name)
     if debug_mode:

+ 39 - 2
post_proc/footnote_remove.py → post_proc/remove_footnote.py

@@ -1,5 +1,4 @@
-from libs.boxbase import _is_in
-from pdf2text_recogFootnoteLine import remove_footnote_text, remove_footnote_image
+from libs.boxbase import _is_in, _is_in_or_part_overlap
 import collections      # 统计库
 
 
@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info):
         del page_info['merged_bboxes']
     del page_info['footnote_bboxes_tmp']
     return page_info
+
+
+def remove_footnote_text(raw_text_block, footnote_bboxes):
+    """
+    Separate the footnote text blocks of the current page from the other text blocks.
+
+    A block is treated as footnote text when
+    `_is_in_or_part_overlap(block['bbox'], footnote_bbox)` is true for at least
+    one entry of `footnote_bboxes`. Each such block gets `block['tag'] = 'footnote'`
+    and is removed from `raw_text_block` in place.
+
+    :param raw_text_block: list of text-block dicts of the current page; each has a 'bbox' entry
+    :param footnote_bboxes: list of footnote bboxes of the current page
+    :return: tuple (raw_text_block without the footnote blocks, list of the removed footnote blocks)
+    """
+    # TODO: doing this at line level would be more rigorous
+    matched_blocks = []
+    for blk in raw_text_block:
+        blk_bbox = blk['bbox']
+        if not any(_is_in_or_part_overlap(blk_bbox, fn_bbox) for fn_bbox in footnote_bboxes):
+            continue
+        blk['tag'] = 'footnote'
+        matched_blocks.append(blk)
+
+    # Remove only after the scan has finished: deleting from the list while
+    # iterating over it would skip elements.
+    for blk in matched_blocks:
+        raw_text_block.remove(blk)
+
+    return raw_text_block, matched_blocks
+
+
+def remove_footnote_image(image_blocks, footnote_bboxes):
+    """
+    Separate the images that belong to footnotes from the other images of the current page.
+
+    An image block is treated as a footnote image when
+    `_is_in(image_block['bbox'], footnote_bbox)` is true for at least one entry
+    of `footnote_bboxes`. Matching blocks are removed from `image_blocks` in place.
+
+    :param image_blocks: list of image-block dicts of the current page; each has a 'bbox' entry
+    :param footnote_bboxes: list of footnote bboxes of the current page
+    :return: tuple (image_blocks without the footnote images, list of the removed image blocks)
+    """
+    matched_images = []
+    for img in image_blocks:
+        img_bbox = img['bbox']
+        if any(_is_in(img_bbox, fn_bbox) for fn_bbox in footnote_bboxes):
+            matched_images.append(img)
+
+    # Deletion happens in a second pass so the list is not mutated while it is iterated.
+    for img in matched_images:
+        image_blocks.remove(img)
+
+    return image_blocks, matched_images

+ 0 - 0
pdf2text_recogEquation.py → pre_proc/detect_equation.py


+ 0 - 0
pdf2text_recogFooter.py → pre_proc/detect_footer_by_model.py


+ 0 - 0
pre_proc/detect_footer_header.py → pre_proc/detect_footer_header_by_statistics.py


+ 0 - 0
pdf2text_recogFootnote.py → pre_proc/detect_footnote.py


+ 0 - 0
pdf2text_recogFigure.py → pre_proc/detect_images.py


+ 0 - 0
pre_proc/image_fix.py → pre_proc/fix_image.py


+ 0 - 0
pre_proc/table_fix.py → pre_proc/fix_table.py


+ 0 - 0
pre_proc/pdf_filter.py → pre_proc/pdf_pre_filter.py


+ 116 - 0
pre_proc/remove_footer_header.py

@@ -0,0 +1,116 @@
+import re
+
+from libs.boxbase import _is_in_or_part_overlap
+
+
+def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
+                                   page_no_bboxs, page_w, page_h):
+    """
+    Remove the header, footer and page number of a single page.
+
+    Text is removed at span level. A line whose spans are all removed is dropped,
+    and a text block whose lines are all dropped (or that had no lines to begin
+    with) is tagged 'in-foot-header-area' and moved to the removal list.
+
+    `text_raw_blocks` and the 'lines' / 'spans' lists inside it are modified in place.
+
+    :param text_raw_blocks: list of text-block dicts with 'bbox' and 'lines';
+        every line has 'spans', every span has 'bbox' and 'text'
+    :param image_bboxes: list of image bboxes of the page
+    :param table_bboxes: list of table bboxes of the page
+    :param header_bboxs: header bboxes [x0, y0, x1, y1]; may be empty
+    :param footer_bboxs: footer bboxes [x0, y0, x1, y1]; may be empty
+    :param page_no_bboxs: page-number bboxes [x0, y0, x1, y1]; may be empty
+    :param page_w: page width
+    :param page_h: page height
+    :return: tuple (image_bbox_remain, table_bbox_remain, text_block_remain,
+        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
+    """
+    # The header area ends at the lowest bottom edge of the header boxes and the
+    # footer area starts at the highest top edge of the footer boxes.
+    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
+    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h
+
+    if page_no_bboxs:
+        # Page numbers in the upper half extend the header area, those in the
+        # lower half extend the footer area.
+        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
+        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
+
+        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
+        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
+
+        header_y0 = max(header_y0, top_max_y0)
+        footer_y0 = min(footer_y0, btn_min_y1)
+
+    content_boundry = [0, header_y0, page_w, footer_y0]
+
+    header = [0, 0, page_w, header_y0]
+    footer = [0, footer_y0, page_w, page_h]
+
+    def _in_header_or_footer(bbox):
+        # True when the span bbox lies above the header boundary or touches the header/footer area.
+        return (bbox[3] < header_y0
+                or _is_in_or_part_overlap(bbox, header)
+                or _is_in_or_part_overlap(bbox, footer))
+
+    # The header/footer boundaries are known; now remove the text that falls into them.
+    text_block_to_remove = []
+    for blk in text_raw_blocks:
+        # Evaluate every span of every line. The removal lists used to be
+        # re-created inside the inner loops, so only the last span of a line and
+        # the last line of a block could ever be dropped.
+        for line in blk['lines']:
+            # Slice assignment keeps the same list object, i.e. the update stays in place.
+            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span['bbox'])]
+        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
+
+        if not blk['lines']:
+            # Nothing is left of this block (or it never had lines): drop the whole block.
+            blk['tag'] = 'in-foot-header-area'
+            text_block_to_remove.append(blk)
+
+    # A page number is often so small that it slightly overlaps content_boundry and
+    # ends up in the main text, so page numbers are handled at span granularity.
+    page_no_block_2_remove = []
+    if page_no_bboxs:
+        for pagenobox in page_no_bboxs:
+            for block in text_raw_blocks:
+                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
+                    continue
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
+                            span['tag'] = "page-no"
+                            # If this span is the only content of the block, drop the block as well.
+                            if len(line['spans']) == 1 and len(block['lines']) == 1:
+                                page_no_block_2_remove.append(block)
+    elif len(text_raw_blocks) > 0:
+        # Heuristic for pages without detected page numbers: the bottom-most block is a
+        # page number if it has exactly one line with one span whose text is non-blank,
+        # contains a digit and contains no letters.
+        # NOTE: this sort reorders the caller's list in place (largest y0 first); it is
+        # kept because callers may rely on the resulting order.
+        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
+        last_block = text_raw_blocks[0]
+        if len(last_block['lines']) == 1:
+            last_line = last_block['lines'][0]
+            if len(last_line['spans']) == 1:
+                last_span = last_line['spans'][0]
+                last_text = last_span['text']
+                if last_text.strip() and not re.search('[a-zA-Z]', last_text) and re.search('[0-9]', last_text):
+                    last_span['tag'] = "page-no"
+                    page_no_block_2_remove.append(last_block)
+
+    text_block_to_remove.extend(page_no_block_2_remove)
+
+    for blk in text_block_to_remove:
+        # The same block can be listed more than once, so check before removing.
+        if blk in text_raw_blocks:
+            text_raw_blocks.remove(blk)
+
+    text_block_remain = text_raw_blocks
+
+    # Images and tables are kept only if they touch the content area.
+    image_bbox_remain = []
+    image_bbox_to_remove = []
+    for bbox in image_bboxes:
+        if _is_in_or_part_overlap(bbox, content_boundry):
+            image_bbox_remain.append(bbox)
+        else:
+            image_bbox_to_remove.append(bbox)
+
+    table_bbox_remain = []
+    table_bbox_to_remove = []
+    for bbox in table_bboxes:
+        if _is_in_or_part_overlap(bbox, content_boundry):
+            table_bbox_remain.append(bbox)
+        else:
+            table_bbox_to_remove.append(bbox)
+
+    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove