1 年之前 · d6a5724b26
--- a/magic_pdf/dict2md/mkcontent.py
+++ b/magic_pdf/dict2md/mkcontent.py
@@ -166,45 +166,65 @@ def mk_mm_markdown_1(para_dict: dict):
 
				     return content_text
			
 
				 
			
 
				 
			
 
				-def __insert_after_para(text, image_path, content_list):
			
 
				+def __insert_after_para(text, type, element, content_list):
			
 
				     """
			
 
				     在content_list中找到text，将image_path作为一个新的node插入到text后面
			
 
				     """
			
 
				     for i, c in enumerate(content_list):
			
 
				         content_type = c.get("type")
			
 
				         if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
			
 
				-            img_node = {
			
 
				-                "type": "image",
			
 
				-                "img_path": image_path,
			
 
				-                "img_alt":"",
			
 
				-                "img_title":"",
			
 
				-                "img_caption":""
			
 
				-            }
			
 
				-            content_list.insert(i+1, img_node)
			
 
				+            if type == "image":
			
 
				+                content_node = {
			
 
				+                    "type": "image",
			
 
				+                    "img_path": element.get("image_path"),
			
 
				+                    "img_alt": "",
			
 
				+                    "img_title": "",
			
 
				+                    "img_caption": "",
			
 
				+                }
			
 
				+            elif type == "table":
			
 
				+                content_node = {
			
 
				+                    "type": "table",
			
 
				+                    "img_path": element.get("image_path"),
			
 
				+                    "table_latex": element.get("text"),
			
 
				+                    "table_title": "",
			
 
				+                    "table_caption": "",
			
 
				+                    "table_quality": element.get("quality"),
			
 
				+                }
			
 
				+            content_list.insert(i+1, content_node)
			
 
				             break
			
 
				     else:
			
 
				-        logger.error(f"Can't find the location of image {image_path} in the markdown file, search target is {text}")
			
 
				+        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
			
 
				     
			
 
				 
			
 
				 
			
 
				-def __insert_before_para(text, image_path, content_list):
			
 
				+def __insert_before_para(text, type, element, content_list):
			
 
				     """
			
 
				     在content_list中找到text，将image_path作为一个新的node插入到text前面
			
 
				     """
			
 
				     for i, c in enumerate(content_list):
			
 
				         content_type = c.get("type")
			
 
				         if content_type in  UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
			
 
				-            img_node = {
			
 
				-                "type": "image",
			
 
				-                "img_path": image_path,
			
 
				-                "img_alt":"",
			
 
				-                "img_title":"",
			
 
				-                "img_caption":""
			
 
				-            }
			
 
				-            content_list.insert(i, img_node)
			
 
				+            if type == "image":
			
 
				+                content_node = {
			
 
				+                    "type": "image",
			
 
				+                    "img_path": element.get("image_path"),
			
 
				+                    "img_alt": "",
			
 
				+                    "img_title": "",
			
 
				+                    "img_caption": "",
			
 
				+                }
			
 
				+            elif type == "table":
			
 
				+                content_node = {
			
 
				+                    "type": "table",
			
 
				+                    "img_path": element.get("image_path"),
			
 
				+                    "table_latex": element.get("text"),
			
 
				+                    "table_title": "",
			
 
				+                    "table_caption": "",
			
 
				+                    "table_quality": element.get("quality"),
			
 
				+                }
			
 
				+            content_list.insert(i, content_node)
			
 
				             break
			
 
				     else:
			
 
				-        logger.error(f"Can't find the location of image {image_path} in the markdown file, search target is {text}")
			
 
				+        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
			
 
				          
			
 
				 
			
 
				 def mk_universal_format(para_dict: dict):
			
@@ -220,9 +240,11 @@ def mk_universal_format(para_dict: dict):
 
				         all_page_images = []
			
 
				         all_page_images.extend(page_info.get("images",[]))
			
 
				         all_page_images.extend(page_info.get("image_backup", []) )
			
 
				-        all_page_images.extend(page_info.get("tables",[]))
			
 
				-        all_page_images.extend(page_info.get("table_backup",[]) )
			
 
				-        
			
 
				+        # all_page_images.extend(page_info.get("tables",[]))
			
 
				+        # all_page_images.extend(page_info.get("table_backup",[]) )
			
 
				+        all_page_tables = []
			
 
				+        all_page_tables.extend(page_info.get("tables", []))
			
 
				+
			
 
				         if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
			
 
				             for img in all_page_images:
			
 
				                 content_node = {
			
@@ -233,6 +255,16 @@ def mk_universal_format(para_dict: dict):
 
				                     "img_caption":""
			
 
				                 }
			
 
				                 page_lst.append(content_node) # TODO 图片顺序
			
 
				+            for table in all_page_tables:
			
 
				+                content_node = {
			
 
				+                    "type": "table",
			
 
				+                    "img_path": table['image_path'],
			
 
				+                    "table_latex": table.get("text"),
			
 
				+                    "table_title": "",
			
 
				+                    "table_caption": "",
			
 
				+                    "table_quality": table.get("quality"),
			
 
				+                }
			
 
				+                page_lst.append(content_node) # TODO 图片顺序
			
 
				         else:
			
 
				             for block in para_blocks:
			
 
				                 item = block["paras"]
			
@@ -266,56 +298,65 @@ def mk_universal_format(para_dict: dict):
 
				         
			
 
				         """插入图片"""
			
 
				         for img in all_page_images:
			
 
				-            imgbox = img['bbox']
			
 
				-            img_content = f"{img['image_path']}"
			
 
				-            # 先看在哪个block内
			
 
				-            for block in pymu_raw_blocks:
			
 
				-                bbox = block['bbox']
			
 
				-                if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在这个大的block内，然后进入逐行比较距离
			
 
				-                    for l in block['lines']:
			
 
				-                        line_box = l['bbox']
			
 
				-                        if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的，插入line前面
			
 
				-                            line_txt = "".join([s['text'] for s in l['spans']])
			
 
				-                            __insert_before_para(line_txt, img_content, content_lst)
			
 
				-                            break
			
 
				-                        break
			
 
				-                    else:# 在行与行之间
			
 
				-                        # 找到图片x0,y0与line的x0,y0最近的line
			
 
				-                        min_distance = 100000
			
 
				-                        min_line = None
			
 
				-                        for l in block['lines']:
			
 
				-                            line_box = l['bbox']
			
 
				-                            distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
			
 
				-                            if distance < min_distance:
			
 
				-                                min_distance = distance
			
 
				-                                min_line = l
			
 
				-                        if min_line:
			
 
				-                            line_txt = "".join([s['text'] for s in min_line['spans']])
			
 
				-                            img_h = imgbox[3] - imgbox[1]
			
 
				-                            if min_distance<img_h: # 文字在图片前面
			
 
				-                                __insert_after_para(line_txt, img_content, content_lst)
			
 
				-                            else:
			
 
				-                                __insert_before_para(line_txt, img_content, content_lst) 
			
 
				-                            break
			
 
				-                        else:
			
 
				-                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
			
 
				-            else:# 应当在两个block之间
			
 
				-                # 找到上方最近的block，如果上方没有就找大下方最近的block
			
 
				-                top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
			
 
				-                if top_txt_block:
			
 
				-                    line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
			
 
				-                    __insert_after_para(line_txt, img_content, content_lst) 
			
 
				-                else:
			
 
				-                    bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
			
 
				-                    if bottom_txt_block:
			
 
				-                        line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
			
 
				-                        __insert_before_para(line_txt, img_content, content_lst) 
			
 
				-                    else: # TODO ，图片可能独占一列，这种情况上下是没有图片的
			
 
				-                        logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
			
 
				+            insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
			
 
				+
			
 
				+        """插入表格"""
			
 
				+        for table in all_page_tables:
			
 
				+            insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
			
 
				     # end for
			
 
				     return content_lst
			
 
				 
			
 
				 
			
 
				+def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
			
 
				+    element_bbox = element['bbox']
			
 
				+    # 先看在哪个block内
			
 
				+    for block in pymu_raw_blocks:
			
 
				+        bbox = block['bbox']
			
 
				+        if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
			
 
				+            3] + 1:  # 确定在这个大的block内，然后进入逐行比较距离
			
 
				+            for l in block['lines']:
			
 
				+                line_box = l['bbox']
			
 
				+                if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
			
 
				+                    3] + 1:  # 在line内的，插入line前面
			
 
				+                    line_txt = "".join([s['text'] for s in l['spans']])
			
 
				+                    __insert_before_para(line_txt, type, element, content_lst)
			
 
				+                    break
			
 
				+                break
			
 
				+            else:  # 在行与行之间
			
 
				+                # 找到图片x0,y0与line的x0,y0最近的line
			
 
				+                min_distance = 100000
			
 
				+                min_line = None
			
 
				+                for l in block['lines']:
			
 
				+                    line_box = l['bbox']
			
 
				+                    distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
			
 
				+                    if distance < min_distance:
			
 
				+                        min_distance = distance
			
 
				+                        min_line = l
			
 
				+                if min_line:
			
 
				+                    line_txt = "".join([s['text'] for s in min_line['spans']])
			
 
				+                    img_h = element_bbox[3] - element_bbox[1]
			
 
				+                    if min_distance < img_h:  # 文字在图片前面
			
 
				+                        __insert_after_para(line_txt, type, element, content_lst)
			
 
				+                    else:
			
 
				+                        __insert_before_para(line_txt, type, element, content_lst)
			
 
				+                    break
			
 
				+                else:
			
 
				+                    logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
			
 
				+    else:  # 应当在两个block之间
			
 
				+        # 找到上方最近的block，如果上方没有就找大下方最近的block
			
 
				+        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
			
 
				+        if top_txt_block:
			
 
				+            line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
			
 
				+            __insert_after_para(line_txt, type, element, content_lst)
			
 
				+        else:
			
 
				+            bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
			
 
				+            if bottom_txt_block:
			
 
				+                line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
			
 
				+                __insert_before_para(line_txt, type, element, content_lst)
			
 
				+            else:  # TODO ，图片可能独占一列，这种情况上下是没有图片的
			
 
				+                logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
			
 
				+
			
 
				+
			
 
				 def mk_mm_markdown(content_list):
			
 
				     """
			
 
				     基于同一格式的内容列表，构造markdown，含图片
			
@@ -348,6 +389,8 @@ def mk_nlp_markdown(content_list):
 
				             content_md.append(c.get("text"))
			
 
				         elif content_type == "equation":
			
 
				             content_md.append(f"$$\n{c.get('latex')}\n$$")
			
 
				+        elif content_type == "table":
			
 
				+            content_md.append(f"$$\n{c.get('table_latex')}\n$$")
			
 
				         elif content_type in UNI_FORMAT_TEXT_TYPE:
			
 
				             content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
			
 
				     return "\n\n".join(content_md)