Ver código fonte

Merge branch 'master' of github.com:papayalove/Magic-PDF

# Conflicts:
#	magic_pdf/model/magic_model.py
liukaiwen 1 ano atrás
pai
commit
8fb6403688

+ 3 - 2
magic_pdf/cli/magicpdf.py

@@ -99,11 +99,12 @@ def json_command(json, method):
         s3_rw = S3ReaderWriter(
             s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
         )
-        may_range_params = parse_s3_range_params(json)
+        may_range_params = parse_s3_range_params(s3path)
         if may_range_params is None or 2 != len(may_range_params):
             byte_start, byte_end = 0, None
         else:
             byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
+            byte_end += byte_start - 1
         return s3_rw.read_jsonl(
             remove_non_official_s3_args(s3path), byte_start, byte_end, MODE_BIN
         )
@@ -143,7 +144,7 @@ def pdf_command(pdf, model, method):
         model = pdf.replace(".pdf", ".json")
         if not os.path.exists(model):
             print(f"make sure json file existed and place under {os.dirname(pdf)}")
-            os.eixt(1)
+            os.exit(1)
 
     def read_fn(path):
         disk_rw = DiskReaderWriter(os.path.dirname(path))

+ 40 - 1
magic_pdf/libs/boxbase.py

@@ -1,7 +1,7 @@
 
 
 from loguru import logger
-
+import math
 
 def _is_in_or_part_overlap(box1, box2) -> bool:
     """
@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
         return right_boxes[0]
     else:
         return None
+
+
+def bbox_relative_pos(bbox1, bbox2):
+    """Tell on which sides of ``bbox1`` the box ``bbox2`` lies strictly outside.
+
+    Both arguments unpack as ``(x0, y0, x1, y1)``.
+
+    Returns:
+        A ``(left, right, bottom, top)`` tuple of comparison results:
+        ``left``   -- bbox2's x1 is smaller than bbox1's x0,
+        ``right``  -- bbox1's x1 is smaller than bbox2's x0,
+        ``bottom`` -- bbox2's y1 is smaller than bbox1's y0,
+        ``top``    -- bbox1's y1 is smaller than bbox2's y0.
+        All four are falsy when the boxes touch or intersect.
+    """
+    a_min_x, a_min_y, a_max_x, a_max_y = bbox1
+    b_min_x, b_min_y, b_max_x, b_max_y = bbox2
+
+    return (
+        b_max_x < a_min_x,
+        a_max_x < b_min_x,
+        b_max_y < a_min_y,
+        a_max_y < b_min_y,
+    )
+    
+def bbox_distance(bbox1, bbox2):
+    """Return the gap between two ``(x0, y0, x1, y1)`` boxes.
+
+    The relative position comes from ``bbox_relative_pos``. When ``bbox2`` is
+    outside ``bbox1`` on two sides at once, the result is the Euclidean distance
+    between the two facing corners; when it is outside on one side only, the
+    result is the gap along that axis; otherwise (boxes touch or intersect)
+    the result is ``0``.
+    """
+    ax0, ay0, ax1, ay1 = bbox1
+    bx0, by0, bx1, by1 = bbox2
+
+    is_left, is_right, is_bottom, is_top = bbox_relative_pos(bbox1, bbox2)
+
+    def corner_gap(p, q):
+        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)
+
+    # Diagonal placements: measure corner to corner.
+    if is_top and is_left:
+        return corner_gap((ax0, ay1), (bx1, by0))
+    if is_left and is_bottom:
+        return corner_gap((ax0, ay0), (bx1, by1))
+    if is_bottom and is_right:
+        return corner_gap((ax1, ay0), (bx0, by1))
+    if is_right and is_top:
+        return corner_gap((ax1, ay1), (bx0, by0))
+
+    # Single-axis placements: measure the gap along that axis.
+    if is_left:
+        return ax0 - bx1
+    if is_right:
+        return bx0 - ax1
+    if is_bottom:
+        return ay0 - by1
+    if is_top:
+        return by0 - ay1
+
+    # The rectangles touch or intersect.
+    return 0

+ 3 - 3
magic_pdf/libs/coordinate_transform.py

@@ -1,9 +1,9 @@
-def get_scale_ratio(ocr_page_info, page):
+def get_scale_ratio(model_page_info, page):
     pix = page.get_pixmap(dpi=72)
     pymu_width = int(pix.w)
     pymu_height = int(pix.h)
-    width_from_json = ocr_page_info['page_info']['width']
-    height_from_json = ocr_page_info['page_info']['height']
+    width_from_json = model_page_info['page_info']['width']
+    height_from_json = model_page_info['page_info']['height']
     horizontal_scale_ratio = width_from_json / pymu_width
     vertical_scale_ratio = height_from_json / pymu_height
     return horizontal_scale_ratio, vertical_scale_ratio

+ 5 - 0
magic_pdf/libs/math.py

@@ -0,0 +1,5 @@
+def float_gt(a, b):
+    """Return whether ``a`` is greater than ``b`` by more than a 0.0001 tolerance.
+
+    Values whose absolute difference is at most 0.0001 are treated as equal,
+    so the result is ``False`` for them.
+    """
+    within_tolerance = 0.0001 >= abs(a - b)
+    return (not within_tolerance) and a > b
+    

+ 13 - 0
magic_pdf/libs/ocr_content_type.py

@@ -4,4 +4,17 @@ class ContentType:
     Text = "text"
     InlineEquation = "inline_equation"
     InterlineEquation = "interline_equation"
+    
+class BlockType:
+    """String tags naming the kinds of layout block."""
+
+    # Image block and its parts.
+    Image = "image"
+    ImageBody = "image_body"
+    ImageCaption = "image_caption"
+
+    # Table block and its parts.
+    Table = "table"
+    TableBody = "table_body"
+    TableCaption = "table_caption"
+    TableFootnote = "table_footnote"
+
+    # Textual and remaining block kinds.
+    Text = "text"
+    Title = "title"
+    InterlineEquation = "interline_equation"
+    Footnote = "footnote"
 

+ 418 - 0
magic_pdf/model/magic_model.py

@@ -464,3 +464,421 @@ if __name__ == "__main__":
         magic_model = MagicModel(model_list, pdf_docs)
         for i in range(7):
             print(magic_model.get_imgs(i))
+
+
+    def __reduct_overlap(self, bboxes):
+        """Return ``bboxes`` without the entries that ``_is_in`` reports as
+        lying inside some other entry of the same list.
+
+        Order of the surviving boxes is preserved.
+        """
+        survivors = []
+        for idx, candidate in enumerate(bboxes):
+            nested = any(
+                _is_in(candidate, other)
+                for other_idx, other in enumerate(bboxes)
+                if other_idx != idx
+            )
+            if not nested:
+                survivors.append(candidate)
+        return survivors
+
+    def __tie_up_category_by_distance(
+        self, page_no, subject_category_id, object_category_id
+    ):
+        """
+        Pair every subject bbox on a page with the object bboxes nearest to it.
+
+        Assumes each subject has at most one object (several adjacent objects
+        may be merged into a single object) and each object belongs to exactly
+        one subject.
+
+        Args:
+            page_no: index of the page inside the model list.
+            subject_category_id: ``category_id`` of the subject detections.
+            object_category_id: ``category_id`` of the object detections.
+
+        Returns:
+            ``(records, total_distance)``. Each record is a dict holding
+            ``subject_body`` (the subject bbox), ``all`` (union of the subject
+            and its matched objects) and, only when objects were matched,
+            ``object_body`` (union of the matched objects). ``total_distance``
+            is the summed bbox distance of all matched pairs plus, for every
+            unmatched object, the distance to its nearest still-unmatched
+            subject (an approximation).
+        """
+        ret = []
+        # Sentinel meaning "no distance was computed for this pair".
+        MAX_DIS_OF_POINT = 10**9 + 7
+
+        subjects = self.__reduct_overlap(
+            [
+                det["bbox"]
+                for det in self.__model_list[page_no]["layout_dets"]
+                if det["category_id"] == subject_category_id
+            ]
+        )
+
+        objects = self.__reduct_overlap(
+            [
+                det["bbox"]
+                for det in self.__model_list[page_no]["layout_dets"]
+                if det["category_id"] == object_category_id
+            ]
+        )
+        subject_object_relation_map = {}
+
+        # Visit subjects ordered by the squared distance of their (x0, y0)
+        # corner from the origin.
+        subjects.sort(key=lambda x: x[0] ** 2 + x[1] ** 2)
+
+        # Subjects first, then objects; the indices of this list identify
+        # boxes everywhere below.
+        all_bboxes = [
+            {"category_id": subject_category_id, "bbox": v} for v in subjects
+        ] + [{"category_id": object_category_id, "bbox": v} for v in objects]
+
+        N = len(all_bboxes)
+        dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]
+
+        # Symmetric pairwise distances; subject-subject pairs keep the sentinel.
+        for i in range(N):
+            for j in range(i):
+                if (
+                    all_bboxes[i]["category_id"] == subject_category_id
+                    and all_bboxes[j]["category_id"] == subject_category_id
+                ):
+                    continue
+
+                dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
+                dis[j][i] = dis[i][j]
+
+        used = set()
+        for i in range(N):
+            # Find the objects associated with the i-th subject.
+            if all_bboxes[i]["category_id"] != subject_category_id:
+                continue
+            seen = set()
+            candidates = []
+            arr = []
+            for j in range(N):
+                # Skip boxes that are outside the subject on more than one
+                # side, i.e. placed diagonally from it.
+                pos_flag_count = sum(
+                    1
+                    for flag in bbox_relative_pos(
+                        all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                    )
+                    if flag
+                )
+                if pos_flag_count > 1:
+                    continue
+                if (
+                    all_bboxes[j]["category_id"] != object_category_id
+                    or j in used
+                    or dis[i][j] == MAX_DIS_OF_POINT
+                ):
+                    continue
+                arr.append((dis[i][j], j))
+
+            arr.sort(key=lambda x: x[0])
+            if len(arr) > 0:
+                candidates.append(arr[0][1])
+                seen.add(arr[0][1])
+
+            # The initial seed has been chosen. ``candidates`` holds at most
+            # one index here and ``set(candidates)`` is evaluated once, so
+            # this loop expands the seed a single step.
+            for j in set(candidates):
+                tmp = []
+                for k in range(i + 1, N):
+                    pos_flag_count = sum(
+                        1
+                        for flag in bbox_relative_pos(
+                            all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
+                        )
+                        if flag
+                    )
+
+                    if pos_flag_count > 1:
+                        continue
+
+                    if (
+                        all_bboxes[k]["category_id"] != object_category_id
+                        or k in used
+                        or k in seen
+                        or dis[j][k] == MAX_DIS_OF_POINT
+                    ):
+                        continue
+                    # k joins the group only if no other free box is at least
+                    # as close to k as the seed j is.
+                    is_nearest = True
+                    for m in range(i + 1, N):
+                        if m in (j, k) or m in used or m in seen:
+                            continue
+
+                        if not float_gt(dis[m][k], dis[j][k]):
+                            is_nearest = False
+                            break
+
+                    if is_nearest:
+                        tmp.append(k)
+                        seen.add(k)
+
+                candidates = tmp
+                if len(candidates) == 0:
+                    break
+
+            # ``seen`` now holds the caption closest to this figure plus the
+            # captions closest to that caption. Expand the bbox to cover the
+            # subject and everything in ``seen``.
+            x0s = [all_bboxes[idx]["bbox"][0] for idx in seen] + [
+                all_bboxes[i]["bbox"][0]
+            ]
+            y0s = [all_bboxes[idx]["bbox"][1] for idx in seen] + [
+                all_bboxes[i]["bbox"][1]
+            ]
+            x1s = [all_bboxes[idx]["bbox"][2] for idx in seen] + [
+                all_bboxes[i]["bbox"][2]
+            ]
+            y1s = [all_bboxes[idx]["bbox"][3] for idx in seen] + [
+                all_bboxes[i]["bbox"][3]
+            ]
+
+            ox0, oy0, ox1, oy1 = min(x0s), min(y0s), max(x1s), max(y1s)
+            ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
+
+            # Split the expanded box into 4 regions around the subject (x below
+            # ix0, y below iy0, y above iy1, x above ix1) and compute the area
+            # of the merged objects that fall inside each region.
+            caption_poses = [
+                [ox0, oy0, ix0, oy1],
+                [ox0, oy0, ox1, iy0],
+                [ox0, iy1, ox1, oy1],
+                [ix1, oy0, ox1, oy1],
+            ]
+
+            caption_areas = []
+            for bbox in caption_poses:
+                embed_arr = []
+                for idx in seen:
+                    if _is_in(all_bboxes[idx]["bbox"], bbox):
+                        embed_arr.append(idx)
+
+                if len(embed_arr) > 0:
+                    embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
+                    embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
+                    embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
+                    embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
+                    caption_areas.append(
+                        int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
+                    )
+                else:
+                    caption_areas.append(0)
+
+            # Keep only the objects of the region with the largest merged area.
+            subject_object_relation_map[i] = []
+            if max(caption_areas) > 0:
+                max_area_idx = caption_areas.index(max(caption_areas))
+                caption_bbox = caption_poses[max_area_idx]
+
+                for j in seen:
+                    if _is_in(all_bboxes[j]["bbox"], caption_bbox):
+                        used.add(j)
+                        subject_object_relation_map[i].append(j)
+
+        for i in sorted(subject_object_relation_map.keys()):
+            result = {
+                "subject_body": all_bboxes[i]["bbox"],
+                "all": all_bboxes[i]["bbox"],
+            }
+
+            if len(subject_object_relation_map[i]) > 0:
+                x0 = min(
+                    [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
+                )
+                y0 = min(
+                    [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
+                )
+                x1 = max(
+                    [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
+                )
+                y1 = max(
+                    [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
+                )
+                result["object_body"] = [x0, y0, x1, y1]
+                result["all"] = [
+                    min(x0, all_bboxes[i]["bbox"][0]),
+                    min(y0, all_bboxes[i]["bbox"][1]),
+                    max(x1, all_bboxes[i]["bbox"][2]),
+                    max(y1, all_bboxes[i]["bbox"][3]),
+                ]
+            ret.append(result)
+
+        total_subject_object_dis = 0
+        # Sum the distance of the pairs that were matched.
+        for i in subject_object_relation_map.keys():
+            for j in subject_object_relation_map[i]:
+                total_subject_object_dis += bbox_distance(
+                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                )
+
+        # Add the distance between unmatched subjects and objects (approximate).
+        # Each subject is tested by its own key (not a leftover loop variable).
+        with_caption_subject = {
+            key
+            for key, matched in subject_object_relation_map.items()
+            if len(matched) > 0
+        }
+        for i in range(N):
+            if all_bboxes[i]["category_id"] != object_category_id or i in used:
+                continue
+            candidates = []
+            for j in range(N):
+                if (
+                    all_bboxes[j]["category_id"] != subject_category_id
+                    or j in with_caption_subject
+                ):
+                    continue
+                candidates.append((dis[i][j], j))
+            if len(candidates) > 0:
+                candidates.sort(key=lambda x: x[0])
+                # Tuples are (distance, subject index): add the distance and
+                # mark that nearest subject as taken.
+                nearest_dis, nearest_subject = candidates[0]
+                total_subject_object_dis += nearest_dis
+                with_caption_subject.add(nearest_subject)
+        return ret, total_subject_object_dis
+
+    def get_imgs(self, page_no: int):  # @许瑞
+        """Return one dict per image found on the page.
+
+        Pairs category 3 (subject) with category 4 (object) detections.
+        Each dict has ``bbox`` (the combined box), ``img_body_bbox`` (the
+        subject box) and ``img_caption_bbox`` (the merged object box, or
+        ``None`` when nothing was paired).
+        """
+        records, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
+        imgs = []
+        for record in records:
+            imgs.append(
+                {
+                    "bbox": record["all"],
+                    "img_body_bbox": record["subject_body"],
+                    "img_caption_bbox": record.get("object_body", None),
+                }
+            )
+        return imgs
+
+    def get_tables(
+        self, page_no: int
+    ) -> list:  # 3 bboxes per table: caption, table body, table note
+        """Return one dict per table found on the page.
+
+        Category 5 (subject) is paired once with category 6 and once with
+        category 7; the two result lists are combined position by position.
+        Each dict has ``table_caption_bbox``, ``table_body_bbox``,
+        ``table_footnote_bbox`` (caption/footnote are ``None`` when nothing
+        was paired) and ``bbox``, the union of both combined boxes.
+        """
+        with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
+        with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)
+        # Both passes use the same subjects, so the lists must line up.
+        assert len(with_captions) == len(with_footnotes)
+
+        tables = []
+        for captioned, footnoted in zip(with_captions, with_footnotes):
+            cap_all = captioned["all"]
+            foot_all = footnoted["all"]
+            tables.append(
+                {
+                    "table_caption_bbox": captioned.get("object_body", None),
+                    "table_body_bbox": captioned["subject_body"],
+                    "table_footnote_bbox": footnoted.get("object_body", None),
+                    "bbox": [
+                        min(cap_all[0], foot_all[0]),
+                        min(cap_all[1], foot_all[1]),
+                        max(cap_all[2], foot_all[2]),
+                        max(cap_all[3], foot_all[3]),
+                    ],
+                }
+            )
+        return tables
+
+    def get_equations(self, page_no: int) -> tuple[list, list, list]:  # has coordinates and text
+        """Return the equation blocks of a page as a 3-tuple.
+
+        Returns:
+            ``(inline_equations, interline_equations, interline_equations_blocks)``.
+            The first two carry the extra ``latex`` column; the third holds
+            bboxes only.
+        """
+        inline_equations = self.__get_blocks_by_type(ModelBlockTypeEnum.EMBEDDING.value, page_no, ["latex"])
+        interline_equations = self.__get_blocks_by_type(ModelBlockTypeEnum.ISOLATED.value, page_no, ["latex"])
+        interline_equations_blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no)
+        return inline_equations, interline_equations, interline_equations_blocks
+
+    def get_discarded(self, page_no: int) -> list:  # in-house model, coordinates only
+        """Return the bbox-only blocks of type ``ABANDON`` for the page."""
+        return self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no)
+
+    def get_text_blocks(self, page_no: int) -> list:  # in-house model: coordinates only, no text
+        """Return the bbox-only blocks of type ``PLAIN_TEXT`` for the page."""
+        return self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no)
+
+    def get_title_blocks(self, page_no: int) -> list:  # in-house model: coordinates only, no text
+        """Return the bbox-only blocks of type ``TITLE`` for the page."""
+        return self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no)
+
+    def get_ocr_text(self, page_no: int) -> list:  # produced by paddle: has both text and coordinates
+        """Return the OCR text spans (category 15) of a page.
+
+        Each span is ``{"bbox": ..., "content": ...}`` built from the
+        detection's ``bbox`` and ``text`` fields. The category id is accepted
+        both as the int ``15`` (the form ``get_all_spans`` compares against)
+        and as the string ``"15"`` (the form this method originally compared
+        against), so either encoding of the model output yields spans.
+        """
+        layout_dets = self.__model_list[page_no]["layout_dets"]
+        return [
+            {
+                "bbox": layout_det["bbox"],
+                "content": layout_det["text"],
+            }
+            for layout_det in layout_dets
+            if layout_det["category_id"] in (15, "15")
+        ]
+
+    def get_all_spans(self, page_no: int) -> list:
+        """Collect the detections of a page that are stitched together as spans.
+
+        Accepted category ids and the span they produce:
+            3  -> image (bbox and type only)
+            5  -> table (bbox and type only)
+            13 -> inline equation, content taken from ``latex``
+            14 -> interline equation, content taken from ``latex``
+            15 -> OCR text, content taken from ``text``
+        Every other category is ignored.
+        """
+        collected = []
+        for det in self.__model_list[page_no]["layout_dets"]:
+            category_id = det["category_id"]
+            if category_id not in (3, 5, 13, 14, 15):
+                continue
+
+            span = {"bbox": det['bbox']}
+
+            # Equations and OCR text carry content; images and tables do not.
+            if category_id in (13, 14):
+                span["content"] = det["latex"]
+            elif category_id == 15:
+                span["content"] = det["text"]
+
+            if category_id == 3:
+                span["type"] = ContentType.Image
+            elif category_id == 5:
+                span["type"] = ContentType.Table
+            elif category_id == 13:
+                span["type"] = ContentType.InlineEquation
+            elif category_id == 14:
+                span["type"] = ContentType.InterlineEquation
+            else:
+                span["type"] = ContentType.Text
+
+            collected.append(span)
+        return collected
+
+    def get_page_size(self, page_no: int):  # page width and height
+        """Return ``(width, height)`` read from the ``rect`` of page ``page_no``."""
+        page = self.__docs[page_no]
+        return page.rect.width, page.rect.height
+
+    def __get_blocks_by_type(self, types: list, page_no: int, extra_col: "list[str] | None" = None) -> list:
+        """Return the detections of the given categories on one page.
+
+        Args:
+            types: container tested with ``category_id in types``.
+                NOTE(review): callers pass ``ModelBlockTypeEnum.<X>.value``;
+                confirm those values are containers rather than scalars.
+            page_no: matched against each page's ``page_info["page_no"]``.
+            extra_col: names of additional detection fields to copy into each
+                block; a missing field is stored as ``None``. Defaults to no
+                extra fields (``None`` instead of a shared mutable ``[]``).
+
+        Returns:
+            A list of ``{"bbox": ..., <extra_col>: ...}`` dicts.
+        """
+        columns = [] if extra_col is None else extra_col
+        blocks = []
+        for page_dict in self.__model_list:
+            layout_dets = page_dict.get("layout_dets", [])
+            page_info = page_dict.get("page_info", {})
+            page_number = page_info.get("page_no", -1)
+            if page_no != page_number:
+                continue
+            for item in layout_dets:
+                category_id = item.get("category_id", -1)
+                bbox = item.get("bbox", None)
+
+                if category_id in types:
+                    block = {
+                        "bbox": bbox
+                    }
+                    for col in columns:
+                        block[col] = item.get(col, None)
+                    blocks.append(block)
+        return blocks
+
+# Manual smoke test: load a model JSON and its PDF from disk, build a
+# MagicModel and print the image records of the first pages.
+# NOTE(review): all paths below are machine-specific.
+if __name__ == "__main__":
+    drw = DiskReaderWriter(r"D:/project/20231108code-clean")
+    # Disabled variant: Windows sample files, relative to the reader's root.
+    if 0:
+        pdf_file_path = r"linshixuqiu\19983-00.pdf"
+        model_file_path = r"linshixuqiu\19983-00_new.json"
+        pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
+        model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
+        model_list = json.loads(model_json_txt)
+        write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
+        img_bucket_path = "imgs"
+        img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
+        pdf_docs = fitz.open("pdf", pdf_bytes)
+        magic_model = MagicModel(model_list, pdf_docs)
+
+    # Active variant: sample files addressed by absolute paths.
+    if 1:
+        model_list = json.loads(
+            drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json")
+        )
+        pdf_bytes = drw.read(
+            "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN
+        )
+        pdf_docs = fitz.open("pdf", pdf_bytes)
+        magic_model = MagicModel(model_list, pdf_docs)
+        # Print the image records of pages 0-6.
+        for i in range(7):
+            print(magic_model.get_imgs(i))

+ 0 - 2
magic_pdf/para/para_split.py

@@ -341,8 +341,6 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
     """
     if len(layout_paras)==0 or len(layout_list_info)==0: # 0的时候最后的return 会出错
         return layout_paras, [False, False]
-    # if page_num==343:
-    #     pass
         
     for i in range(1, len(layout_paras)):
         pre_layout_list_info = layout_list_info[i-1]

+ 101 - 0
magic_pdf/parse_by_ocr_v2.py

@@ -0,0 +1,101 @@
+import time
+
+from loguru import logger
+
+from magic_pdf.layout.layout_sort import get_bboxes_layout
+from magic_pdf.libs.convert_utils import dict_to_list
+from magic_pdf.libs.hash_utils import compute_md5
+from magic_pdf.libs.commons import fitz, get_delta_time
+from magic_pdf.model.magic_model import MagicModel
+from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
+from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
+from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
+from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans
+from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2
+
+
+def parse_pdf_by_ocr(pdf_bytes,
+                     model_list,
+                     imageWriter,
+                     start_page_id=0,
+                     end_page_id=None,
+                     debug_mode=False,
+                     ):
+    """Parse a PDF page by page from model output and return its page info.
+
+    Args:
+        pdf_bytes: raw bytes of the PDF; also hashed with ``compute_md5``.
+        model_list: model output handed to ``MagicModel``.
+        imageWriter: writer passed to ``ocr_cut_image_and_table``.
+        start_page_id: first page index to parse (inclusive).
+        end_page_id: last page index to parse (inclusive). ``None`` means the
+            last page of the document; an explicit ``0`` now means "only up
+            to page 0" instead of being mistaken for "not given".
+        debug_mode: when true, log the time spent on each page.
+
+    Returns:
+        ``{"pdf_info": [...]}`` where the list is produced by ``dict_to_list``
+        from the per-page dicts.
+    """
+    pdf_bytes_md5 = compute_md5(pdf_bytes)
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+
+    # Start with an empty pdf_info_dict.
+    pdf_info_dict = {}
+
+    # Build the magic_model from model_list and the opened document.
+    magic_model = MagicModel(model_list, pdf_docs)
+
+    # Resolve the page range to parse.
+    if end_page_id is None:
+        end_page_id = len(pdf_docs) - 1
+
+    # Reference time for the per-page timing log.
+    start_time = time.time()
+
+    for page_id in range(start_page_id, end_page_id + 1):
+
+        # In debug mode, log how long the previous page took.
+        if debug_mode:
+            time_now = time.time()
+            logger.info(
+                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
+            )
+            start_time = time_now
+
+        # Fetch the block information used below from magic_model.
+        img_blocks = magic_model.get_imgs(page_id)
+        table_blocks = magic_model.get_tables(page_id)
+        discarded_blocks = magic_model.get_discarded(page_id)
+        text_blocks = magic_model.get_text_blocks(page_id)
+        title_blocks = magic_model.get_title_blocks(page_id)
+        # Only the interline equation blocks are used in this function.
+        _inline_equations, _interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
+
+        page_w, page_h = magic_model.get_page_size(page_id)
+
+        # Gather the bboxes of all blocks.
+        all_bboxes = ocr_prepare_bboxes_for_layout_split(
+            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
+            interline_equation_blocks, page_w, page_h)
+
+        # Compute the layout from the block information.
+        page_boundry = [0, 0, page_w, page_h]
+        layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
+
+        # Sort the blocks kept on this page by layout order.
+        sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
+
+        # Fetch every span that has to be stitched into the blocks.
+        spans = magic_model.get_all_spans(page_id)
+        # Among overlapping spans, drop the smaller ones.
+        spans, _dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+        # Crop images and tables.
+        spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
+
+        # Fill the spans into the sorted blocks.
+        block_with_spans = fill_spans_in_blocks(sorted_blocks, spans)
+
+        # Apply the fix step to the blocks.
+        fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
+
+        # Lists that QA needs exposed separately.
+        images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
+
+        # Build this page's entry of pdf_info_dict.
+        page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                                    images, tables, interline_equations, discarded_blocks)
+        pdf_info_dict[f"page_{page_id}"] = page_info
+
+    # TODO: paragraph splitting is not implemented yet.
+
+    # Convert the dict to a list.
+    pdf_info_list = dict_to_list(pdf_info_dict)
+    new_pdf_info_dict = {
+        "pdf_info": pdf_info_list,
+    }
+
+    return new_pdf_info_dict

+ 0 - 3
magic_pdf/pdf_parse_by_ocr.py

@@ -160,9 +160,6 @@ def parse_pdf_by_ocr(
         '''bbox去除粘连'''
         spans = remove_overlap_between_bbox(spans)
 
-        '''用现有的bbox计算layout'''
-
-
         '''
         对tpye=["interline_equation", "image", "table"]进行额外处理,
         如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0

+ 9 - 9
magic_pdf/pipe/UNIPipe.py

@@ -1,21 +1,17 @@
 import json
 
 from loguru import logger
-from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
-from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
-from magic_pdf.filter.pdf_classify_by_type import classify
-from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
+                 is_debug: bool = False):
         self.pdf_type = self.PIP_OCR
         super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
 
@@ -24,9 +20,11 @@ class UNIPipe(AbsPipe):
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
-            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
+                                                is_debug=self.is_debug)
         elif self.pdf_type == self.PIP_OCR:
-            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
+                                              is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self):
         content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
@@ -36,6 +34,7 @@ class UNIPipe(AbsPipe):
         markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return markdown_content
 
+
 if __name__ == '__main__':
     # 测试
     drw = DiskReaderWriter(r"D:/project/20231108code-clean")
@@ -60,5 +59,6 @@ if __name__ == '__main__':
 
     md_writer = DiskReaderWriter(write_path)
     md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
-    md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json", AbsReaderWriter.MODE_TXT)
+    md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json",
+                    AbsReaderWriter.MODE_TXT)
     md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)

+ 28 - 11
magic_pdf/pre_proc/construct_page_dict.py

@@ -1,12 +1,13 @@
-
-def construct_page_component(page_id, image_info, table_info,  text_blocks_preproc, layout_bboxes, inline_eq_info, interline_eq_info, raw_pymu_blocks, 
-                             removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,layout_tree,
+def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
+                             interline_eq_info, raw_pymu_blocks,
+                             removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
+                             layout_tree,
                              page_w, page_h, footnote_bboxes_tmp):
     """
     
     """
     return_dict = {}
-    
+
     return_dict['para_blocks'] = {}
     return_dict['preproc_blocks'] = text_blocks_preproc
     return_dict['images'] = image_info
@@ -16,24 +17,24 @@ def construct_page_component(page_id, image_info, table_info,  text_blocks_prepr
     return_dict['layout_bboxes'] = layout_bboxes
     return_dict['pymu_raw_blocks'] = raw_pymu_blocks
     return_dict['global_statistic'] = {}
-    
+
     return_dict['droped_text_block'] = removed_text_blocks
     return_dict['droped_image_block'] = removed_image_blocks
     return_dict['droped_table_block'] = []
     return_dict['image_backup'] = images_backup
-    return_dict['table_backup'] = []    
+    return_dict['table_backup'] = []
     return_dict['page_idx'] = page_id
     return_dict['page_size'] = [page_w, page_h]
-    return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
+    return_dict['_layout_tree'] = layout_tree  # 辅助分析layout作用
     return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
-    
+
     return return_dict
 
 
 def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                             images, tables, interline_equations, inline_equations,
-                             dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
-                             need_remove_spans_bboxes_dict):
+                                 images, tables, interline_equations, inline_equations,
+                                 dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
+                                 need_remove_spans_bboxes_dict):
     return_dict = {
         'preproc_blocks': blocks,
         'layout_bboxes': layout_bboxes,
@@ -51,3 +52,19 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,
         'droped_bboxes': need_remove_spans_bboxes_dict,
     }
     return return_dict
+
+
+def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
+                                    images, tables, interline_equations, droped_blocks):
+    """Bundle the per-page parsing results into a single dict.
+
+    The arguments are stored unchanged under fixed keys; ``page_w`` and
+    ``page_h`` are combined into the two-element list ``page_size``.
+    """
+    keys = (
+        'preproc_blocks',
+        'layout_bboxes',
+        'page_idx',
+        'page_size',
+        '_layout_tree',
+        'images',
+        'tables',
+        'interline_equations',
+        'droped_blocks',
+    )
+    values = (
+        blocks,
+        layout_bboxes,
+        page_id,
+        [page_w, page_h],
+        layout_tree,
+        images,
+        tables,
+        interline_equations,
+        droped_blocks,
+    )
+    return dict(zip(keys, values))

+ 35 - 0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -0,0 +1,35 @@
+from magic_pdf.libs.ocr_content_type import BlockType
+
+
+def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
+                                        title_blocks, interline_equation_blocks, page_w, page_h):
+    """Flatten the typed blocks of one page into 12-slot bbox rows.
+
+    Each row has the shape
+    ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None]``.
+    Rows are emitted in this order: images, tables, texts, titles, interline
+    equations, and finally the discarded blocks that qualify as footnotes.
+    Every input block must be a mapping with a 4-element ``'bbox'``.
+    """
+
+    def _row(bbox, block_type):
+        # Unpacking enforces that the bbox has exactly four coordinates.
+        x0, y0, x1, y1 = bbox
+        return [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None]
+
+    all_bboxes = []
+    all_bboxes += [_row(blk['bbox'], BlockType.Image) for blk in img_blocks]
+    all_bboxes += [_row(blk['bbox'], BlockType.Table) for blk in table_blocks]
+    all_bboxes += [_row(blk['bbox'], BlockType.Text) for blk in text_blocks]
+    all_bboxes += [_row(blk['bbox'], BlockType.Title) for blk in title_blocks]
+    all_bboxes += [_row(blk['bbox'], BlockType.InterlineEquation) for blk in interline_equation_blocks]
+
+    # Discarded blocks are only kept (tagged as footnotes) when they are wider
+    # than a third of the page, taller than 10, and their y0 lies beyond half
+    # of the page height.
+    for blk in discarded_blocks:
+        x0, y0, x1, y1 = blk['bbox']
+        if not ((x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2)):
+            continue
+        all_bboxes.append(_row([x0, y0, x1, y1], BlockType.Footnote))
+
+    return all_bboxes
+

+ 86 - 4
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -3,7 +3,8 @@ from loguru import logger
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
     calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
 
 
 # 将每一个line中的span从左到右排序
@@ -24,6 +25,7 @@ def line_sort_spans_by_left_to_right(lines):
         })
     return line_objects
 
+
 def merge_spans_to_line(spans):
     if len(spans) == 0:
         return []
@@ -37,7 +39,8 @@ def merge_spans_to_line(spans):
             # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
             # image和table类型,同上
             if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
+                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
+                    current_line):
                 # 则开始新行
                 lines.append(current_line)
                 current_line = [span]
@@ -57,6 +60,7 @@ def merge_spans_to_line(spans):
 
         return lines
 
+
 def merge_spans_to_line_by_layout(spans, layout_bboxes):
     lines = []
     new_spans = []
@@ -103,7 +107,85 @@ def merge_lines_to_block(lines):
     return blocks
 
 
+def sort_blocks_by_layout(all_bboxes, layout_bboxes):
+    """Group page blocks by layout region and return them as one ordered list.
+
+    Layouts are visited in the order given by ``layout_bboxes``. For each
+    one, every non-footnote block whose overlap ratio with the layout bbox
+    exceeds 0.8 is assigned to it. Blocks inside a layout are then sorted by
+    their second element (y0).
+
+    NOTE: blocks assigned to a layout are removed from ``all_bboxes``, so the
+    caller's list is mutated. Footnote blocks and blocks that match no layout
+    are left in ``all_bboxes`` and are absent from the result.
+
+    :param all_bboxes: block rows; indices 0-3 are read as the bbox and
+        index 7 as the block type.
+    :param layout_bboxes: items exposing a ``'layout_bbox'`` entry.
+    :return: flat list of the kept block rows, layout by layout.
+    """
+    new_blocks = []
+    sort_blocks = []
+    for item in layout_bboxes:
+        layout_bbox = item['layout_bbox']
 
+        # Walk all remaining blocks and collect the ones that belong to this layout
+        layout_blocks = []
+        for block in all_bboxes:
+            # Footnote blocks are never placed into a layout
+            if block[7] == BlockType.Footnote:
+                continue
+            block_bbox = [block[0], block[1], block[2], block[3]]
+            # presumably the share of the block's own area lying inside the layout -- confirm in boxbase
+            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
+                layout_blocks.append(block)
 
-
-
+        # If this layout received any blocks, keep them together as one group
+        if len(layout_blocks) > 0:
+            new_blocks.append(layout_blocks)
+            # Remove the blocks just assigned from all_bboxes so a later layout cannot claim them again
+            for layout_block in layout_blocks:
+                all_bboxes.remove(layout_block)
+
+    # If any group was built, sort the blocks inside each group
+    if len(new_blocks) > 0:
+        for bboxes_in_layout_block in new_blocks:
+            bboxes_in_layout_block.sort(key=lambda x: x[1])  # within one layout, order boxes top to bottom by y0
+            sort_blocks.extend(bboxes_in_layout_block)
+
+    # sort_blocks now holds every block kept for this page, already in order
+    return sort_blocks
+
+
+def fill_spans_in_blocks(blocks, spans):
+    """Attach each span to the first block that contains it.
+
+    For every block row (bbox in slots 0-3, block type in slot 7) a dict with
+    the keys ``'block_type'``, ``'bbox'`` and ``'spans'`` is produced. A span
+    is attached when its overlap ratio with the block bbox exceeds 0.8.
+
+    NOTE: attached spans are removed from ``spans``, so the caller's list is
+    mutated and a span can end up in at most one block.
+    """
+    filled = []
+    for row in blocks:
+        kind = row[7]
+        bbox = row[0:4]
+        matched = [
+            candidate for candidate in spans
+            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], bbox) > 0.8
+        ]
+        filled.append({'block_type': kind, 'bbox': bbox, 'spans': matched})
+
+        # Consume the spans that were just assigned to this block.
+        for used in matched:
+            spans.remove(used)
+
+    return filled
+
+
+def fix_block_spans(block_with_spans, img_blocks, table_blocks):
+    """Turn flat ``spans`` lists into the final per-block structure.
+
+    Image and table blocks are nested: their caption/footnote text spans are
+    moved into the matching sub-blocks by ``fix_image_block`` and
+    ``fix_table_block``. Text, title and interline-equation blocks are handed
+    to ``fix_text_block``. Each of those helpers also drops the ``'spans'``
+    field. Blocks of any other type are left out of the result.
+    """
+    fixed = []
+    for blk in block_with_spans:
+        kind = blk['block_type']
+
+        if kind == BlockType.Image:
+            fixed.append(fix_image_block(blk, img_blocks))
+        elif kind == BlockType.Table:
+            fixed.append(fix_table_block(blk, table_blocks))
+        elif kind in (BlockType.Text, BlockType.Title, BlockType.InterlineEquation):
+            fixed.append(fix_text_block(blk))
+        # anything else is intentionally skipped
+    return fixed

+ 113 - 0
magic_pdf/pre_proc/ocr_fix_block_logic.py

@@ -0,0 +1,113 @@
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, line_sort_spans_by_left_to_right
+
+
+def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
+    """Build a block of the given type from the spans lying inside ``block_bbox``.
+
+    A span is selected when its overlap ratio with ``block_bbox`` exceeds 0.8.
+    The selected spans are merged into lines, and each line's spans are then
+    ordered left to right.
+
+    :return: ``(block, selected_spans)``. ``spans`` itself is not modified,
+        so the caller decides whether to remove the selected spans.
+    """
+    selected = [
+        candidate for candidate in spans
+        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.8
+    ]
+    ordered_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(selected))
+    return {
+        'bbox': block_bbox,
+        'block_type': block_type,
+        'lines': ordered_lines
+    }, selected
+
+
+def make_body_block(span: dict, block_bbox: list, block_type: str):
+    """Wrap a single span into a one-line block of type ``block_type``.
+
+    The same ``block_bbox`` object is used for both the block and its only
+    line; the span is stored by reference.
+    """
+    return {
+        'bbox': block_bbox,
+        'block_type': block_type,
+        'lines': [
+            {
+                'bbox': block_bbox,
+                'spans': [span],
+            }
+        ]
+    }
+
+
+def fix_image_block(block, img_blocks):
+    """Replace an image block's flat ``spans`` with nested ``blocks``.
+
+    The first entry of ``img_blocks`` whose ``'bbox'`` equals the block's bbox
+    drives the conversion:
+
+    * the first image span whose bbox equals ``img_body_bbox`` becomes the
+      image-body sub-block and is taken out of the span list;
+    * if ``img_caption_bbox`` is non-empty, the remaining spans inside it
+      become the image-caption sub-block.
+
+    ``block`` is modified in place (``'blocks'`` added, ``'spans'`` deleted)
+    and returned. Without a matching entry ``'blocks'`` stays empty.
+    """
+    block['blocks'] = []
+    # Locate the detected image block that corresponds to this block.
+    matched = next((cand for cand in img_blocks if cand['bbox'] == block['bbox']), None)
+    if matched is not None:
+        remaining = block['spans']
+        body_span = next(
+            (s for s in remaining
+             if s['type'] == ContentType.Image and s['bbox'] == matched['img_body_bbox']),
+            None,
+        )
+        if body_span is not None:
+            block['blocks'].append(
+                make_body_block(body_span, matched['img_body_bbox'], BlockType.ImageBody)
+            )
+            # The body span is consumed; it must not end up in the caption.
+            remaining.remove(body_span)
+
+        # A non-empty bbox list means the image has a caption.
+        if len(matched['img_caption_bbox']) > 0:
+            caption_block, _ = merge_spans_to_block(
+                remaining, matched['img_caption_bbox'], BlockType.ImageCaption
+            )
+            block['blocks'].append(caption_block)
+
+    del block['spans']
+    return block
+
+
+def fix_table_block(block, table_blocks):
+    """Replace a table block's flat ``spans`` with nested ``blocks``.
+
+    The first entry of ``table_blocks`` whose ``'bbox'`` equals the block's
+    bbox drives the conversion, in this order:
+
+    1. the first table span whose bbox equals ``table_body_bbox`` becomes the
+       table-body sub-block and is taken out of the span list;
+    2. if ``table_caption_bbox`` is non-empty, the spans inside it become the
+       table-caption sub-block and are taken out of the span list;
+    3. if ``table_footnote_bbox`` is non-empty, the spans inside it (among
+       those still left) become the table-footnote sub-block.
+
+    ``block`` is modified in place (``'blocks'`` added, ``'spans'`` deleted)
+    and returned. Without a matching entry ``'blocks'`` stays empty.
+    """
+    block['blocks'] = []
+    # Locate the detected table block that corresponds to this block.
+    matched = next((cand for cand in table_blocks if cand['bbox'] == block['bbox']), None)
+    if matched is not None:
+        remaining = block['spans']
+        body_span = next(
+            (s for s in remaining
+             if s['type'] == ContentType.Table and s['bbox'] == matched['table_body_bbox']),
+            None,
+        )
+        if body_span is not None:
+            block['blocks'].append(
+                make_body_block(body_span, matched['table_body_bbox'], BlockType.TableBody)
+            )
+            # The body span is consumed; it must not end up in caption/footnote.
+            remaining.remove(body_span)
+
+        # A non-empty bbox list means the table has a caption.
+        if len(matched['table_caption_bbox']) > 0:
+            caption_block, caption_spans = merge_spans_to_block(
+                remaining, matched['table_caption_bbox'], BlockType.TableCaption
+            )
+            block['blocks'].append(caption_block)
+            # Caption spans are consumed so the footnote step cannot reuse them.
+            for used in caption_spans:
+                remaining.remove(used)
+
+        # A non-empty bbox list means the table has a footnote.
+        if len(matched['table_footnote_bbox']) > 0:
+            footnote_block, _ = merge_spans_to_block(
+                remaining, matched['table_footnote_bbox'], BlockType.TableFootnote
+            )
+            block['blocks'].append(footnote_block)
+
+    del block['spans']
+    return block
+
+
+def fix_text_block(block):
+    """Convert a text-like block's ``spans`` into left-to-right sorted ``lines``.
+
+    ``block`` is modified in place (``'lines'`` added, ``'spans'`` deleted)
+    and returned.
+    """
+    block['lines'] = line_sort_spans_by_left_to_right(merge_spans_to_line(block['spans']))
+    del block['spans']
+    return block
+
+

+ 25 - 7
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -3,7 +3,7 @@ from loguru import logger
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
     __is_overlaps_y_exceeds_threshold
 from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 
 
 def remove_overlaps_min_spans(spans):
@@ -50,7 +50,8 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
                     need_remove_spans.append(span)
                     break
                 # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
-                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1]+span['bbox'][3])/2 > removed_bbox[3] and removed_bbox[0] < (span['bbox'][0]+span['bbox'][2])/2 < removed_bbox[2]:
+                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
+                        removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
                     need_remove_spans.append(span)
                     break
 
@@ -162,9 +163,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
             text_line = text_inline_lines[j]
             y0, y1 = text_line[1]
             if (
-                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
-                span['bbox'], (0, y0, 0, y1)):
-
+                    span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
+            ) and __is_overlaps_y_exceeds_threshold(
+                span['bbox'], (0, y0, 0, y1)
+            ):
                 # 调整公式类型
                 if span["type"] == ContentType.InterlineEquation:
                     # 最后一行是行间公式
@@ -181,8 +183,8 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                             span["bbox"][1] = y0
                             span["bbox"][3] = y1
                 break
-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
-                                                                                                       (0, y0, 0, y1)):
+            elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
+                                                                                                (0, y0, 0, y1)):
                 break
             else:
                 j += 1
@@ -211,3 +213,19 @@ def get_qa_need_list(blocks):
                 else:
                     continue
     return images, tables, interline_equations, inline_equations
+
+
+def get_qa_need_list_v2(blocks):
+    """Split blocks into images, tables and interline equations.
+
+    The block kind is read from ``block["type"]``. When that key is absent
+    the function falls back to ``block["block_type"]``, which is the key the
+    block-building helpers in ``ocr_dict_merge`` / ``ocr_fix_block_logic``
+    write. Blocks of any other kind are ignored.
+
+    :param blocks: iterable of block dicts.
+    :return: ``(images, tables, interline_equations)``; each list holds the
+        original block objects (no copies), in input order.
+    :raises KeyError: if a block carries neither ``"type"`` nor ``"block_type"``.
+    """
+    images = []
+    tables = []
+    interline_equations = []
+
+    for block in blocks:
+        # Accept both spellings of the kind key so blocks produced by
+        # fix_block_spans (which use 'block_type') do not raise KeyError.
+        block_type = block["type"] if "type" in block else block["block_type"]
+        if block_type == BlockType.Image:
+            images.append(block)
+        elif block_type == BlockType.Table:
+            tables.append(block)
+        elif block_type == BlockType.InterlineEquation:
+            interline_equations.append(block)
+    return images, tables, interline_equations