Browse Source

元素类型引用统一定义

xuchao 1 year ago
parent
commit
83753cbd77
3 changed files with 23 additions and 16 deletions
  1. +4 -4
      demo/ocr_demo.py
  2. +5 -1
      magic_pdf/dict2md/mkcontent.py
  3. +14 -11
      magic_pdf/para/para_split.py

+ 4 - 4
demo/ocr_demo.py

@@ -30,13 +30,13 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    #ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    #ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
     # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
     # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
     
-    # ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
-    # ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
+    ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
+    ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
     try:
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)

+ 5 - 1
magic_pdf/dict2md/mkcontent.py

@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
         if content_type == "text":
             content_md.append(c.get("text"))
         elif content_type == "equation":
-            content_md.append(f"$$\n{c.get('latex')}\n$$")
+            content = c.get("latex")
+            if content.startswith("$$") and content.endswith("$$"):
+                content_md.append(content)
+            else:
+                content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
         elif content_type in UNI_FORMAT_TEXT_TYPE:
             content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
         elif content_type == "image":

+ 14 - 11
magic_pdf/para/para_split.py

@@ -3,11 +3,12 @@ import numpy as np
 from loguru import logger
 
 from magic_pdf.libs.boxbase import _is_in
+from magic_pdf.libs.ocr_content_type import ContentType
 
 
 LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"]
-INLINE_EQUATION = 'inline_equation'
-INTER_EQUATION = "displayed_equation"
+INLINE_EQUATION = ContentType.InlineEquation
+INTERLINE_EQUATION = ContentType.InterlineEquation
 TEXT = "text"
 
 def __add_line_period(blocks, layout_bboxes):
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
         for line in block['lines']:
             last_span = line['spans'][-1]
             span_type = last_span['type']
-            if span_type in [TEXT, INLINE_EQUATION]:
+            if span_type in [INLINE_EQUATION]:
                 span_content = last_span['content'].strip()
                 if span_type==INLINE_EQUATION and span_content[-1] not in LINE_STOP_FLAG:
-                    if span_type in [INLINE_EQUATION, INTER_EQUATION]:
+                    if span_type in [INLINE_EQUATION, INTERLINE_EQUATION]:
                         last_span['content'] = span_content + '.'
 
 
 
 def __valign_lines(blocks, layout_bboxes):
     """
-    对齐行的左侧和右侧。
-    扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。
-    3是个经验值,TODO,计算得来
-    
+    在一个layoutbox内对齐行的左侧和右侧。
+    扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
+    3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
     """
     
     min_distance = 3
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
                 else: 
                     para.append(line)
             else: # 其他,图片、表格、行间公式,各自占一段
-                para.append(line)
-                paras.append(para)
+                if len(para)>0:
+                    paras.append(para)
+                    para = []
+                else:
+                    paras.append([line])
+                    para = []
                 # para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
                 # logger.info(para_text)
-                para = []
         if len(para)>0:
             paras.append(para)
             # para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])