Răsfoiți Sursa

ocr_mk_mm_markdown_with_para和ocr_mk_mm_markdown_with_para_and_pagination逻辑优化

赵小蒙 1 an în urmă
părinte
comite
7d010e1969
1 fișier modificat, cu 25 adăugiri și 38 ștergeri
  1. 25 38
      magic_pdf/dict2md/ocr_mkcontent.py

+ 25 - 38
magic_pdf/dict2md/ocr_mkcontent.py

@@ -72,53 +72,18 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
     markdown = []
     for _, page_info in pdf_info_dict.items():
         paras_of_layout = page_info.get("para_blocks")
-        if not paras_of_layout:
-            continue
-        for paras in paras_of_layout:
-            for para in paras:
-                para_text = ''
-                for line in para:
-                    for span in line['spans']:
-                        span_type = span.get('type')
-                        if span_type == ContentType.Text:
-                            content = split_long_words(span['content'])
-                            pass
-                        elif span_type == ContentType.InlineEquation:
-                            content = f" ${span['content']}$ "
-                        elif span_type == ContentType.InterlineEquation:
-                            content = f"\n$$\n{span['content']}\n$$\n"
-                        elif span_type in [ ContentType.Image, ContentType.Table ]:
-                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
-                        para_text += content + ' '
-                markdown.append(para_text.strip() + '  ')
-
+        page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout)
+        markdown.extend(page_markdown)
     return '\n\n'.join(markdown)
 
 
 def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
     markdown_with_para_and_pagination = []
     for page_no, page_info in pdf_info_dict.items():
-        page_markdown = []
         paras_of_layout = page_info.get("para_blocks")
         if not paras_of_layout:
             continue
-        for paras in paras_of_layout:
-            for para in paras:
-                para_text = ''
-                for line in para:
-                    for span in line['spans']:
-                        span_type = span.get('type')
-                        if span_type == ContentType.Text:
-                            content = split_long_words(span['content'])
-                            # content = span['content']
-                        elif span_type == ContentType.InlineEquation:
-                            content = f"${span['content']}$"
-                        elif span_type == ContentType.InterlineEquation:
-                            content = f"\n$$\n{span['content']}\n$$\n"
-                        elif span_type in [ContentType.Image, ContentType.Table]:
-                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
-                        para_text += content + ' '
-                page_markdown.append(para_text.strip() + '  ')
+        page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout)
         markdown_with_para_and_pagination.append({
             'page_no': page_no,
             'md_content': '\n\n'.join(page_markdown)
@@ -126,6 +91,28 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
     return markdown_with_para_and_pagination
 
 
+def ocr_mk_mm_markdown_with_para_core(paras_of_layout):
+    """Convert the paragraph blocks of one page into markdown paragraph strings.
+
+    Args:
+        paras_of_layout: iterable of layouts; each layout is an iterable of
+            paragraphs, and each paragraph is an iterable of line dicts that
+            carry a 'spans' list. May be None or empty, because callers pass
+            page_info.get("para_blocks") through without filtering.
+
+    Returns:
+        A list with one string per paragraph: the rendered spans joined by
+        single spaces, stripped, and suffixed with two spaces. An empty list
+        is returned when there is nothing to render.
+    """
+    page_markdown = []
+    # A missing or empty para_blocks value must produce no paragraphs instead
+    # of raising TypeError when iterating None.
+    if not paras_of_layout:
+        return page_markdown
+    for paras in paras_of_layout:
+        for para in paras:
+            rendered_spans = []
+            for line in para:
+                for span in line['spans']:
+                    span_type = span.get('type')
+                    if span_type == ContentType.Text:
+                        content = split_long_words(span['content'])
+                    elif span_type == ContentType.InlineEquation:
+                        content = f"${span['content']}$"
+                    elif span_type == ContentType.InterlineEquation:
+                        content = f"\n$$\n{span['content']}\n$$\n"
+                    elif span_type in (ContentType.Image, ContentType.Table):
+                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
+                    else:
+                        # Unknown span type: skip it. Falling through would
+                        # either hit an unbound `content` or re-emit the
+                        # previous span's text.
+                        continue
+                    rendered_spans.append(content)
+            # Same result as appending "content + ' '" per span and stripping.
+            page_markdown.append(' '.join(rendered_spans).strip() + '  ')
+    return page_markdown
+
+
 def make_standard_format_with_para(pdf_info_dict: dict):
     content_list = []
     for _, page_info in pdf_info_dict.items():