Эх сурвалжийг харах

ocr增加分页markdown输出格式

赵小蒙 1 жил өмнө
parent
commit
e9aa103cae

+ 30 - 0
magic_pdf/dict2md/ocr_mkcontent.py

@@ -94,6 +94,36 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
     return '\n\n'.join(markdown)
 
 
+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
+    """Render each page's paragraph blocks as markdown, one entry per page.
+
+    Args:
+        pdf_info_dict: Mapping of page key to page info. Each page info is
+            read for a ``para_blocks`` entry, which is walked as
+            paragraphs -> lines -> ``line['spans']``.
+
+    Returns:
+        A list of ``{'page_no': <page key>, 'md': <markdown string>}`` dicts,
+        in the iteration order of ``pdf_info_dict``. Pages whose
+        ``para_blocks`` is missing or empty are omitted.
+    """
+    markdown_with_para_and_pagination = []
+    for page_no, page_info in pdf_info_dict.items():
+        paras = page_info.get("para_blocks")
+        if not paras:
+            continue
+        page_markdown = []
+        for para in paras:
+            span_texts = []
+            for line in para:
+                for span in line['spans']:
+                    span_type = span.get('type')
+                    if span_type == ContentType.Text:
+                        content = split_long_words(span['content'])
+                    elif span_type == ContentType.InlineEquation:
+                        content = f"${span['content']}$"
+                    elif span_type == ContentType.InterlineEquation:
+                        content = f"\n$$\n{span['content']}\n$$\n"
+                    elif span_type in (ContentType.Image, ContentType.Table):
+                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
+                    else:
+                        # Unrecognized span type: skip it. Without this branch
+                        # `content` would be unbound on the first span, or would
+                        # silently repeat the previous span's text.
+                        continue
+                    span_texts.append(content)
+            # Spans are separated by single spaces; every paragraph ends with
+            # two trailing spaces (a markdown hard line break).
+            page_markdown.append(' '.join(span_texts).strip() + '  ')
+        markdown_with_para_and_pagination.append({
+            'page_no': page_no,
+            'md': '\n\n'.join(page_markdown)
+        })
+    return markdown_with_para_and_pagination
+
+
 def make_standard_format_with_para(pdf_info_dict: dict):
     content_list = []
     for _, page_info in pdf_info_dict.items():

+ 30 - 1
magic_pdf/pipeline.py

@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
     ocr_mk_nlp_markdown,
     ocr_mk_mm_markdown,
     ocr_mk_mm_standard_format,
-    ocr_mk_mm_markdown_with_para,
+    ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination,
 )
 from magic_pdf.libs.commons import (
     read_file,
@@ -525,6 +525,35 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
     return jso
 
 
+def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
+    """Fill ``jso["content"]`` with per-page markdown built from its intermediate dict.
+
+    Args:
+        jso: Record to process. Reads ``need_drop``, ``file_id`` and the
+            compressed ``pdf_intermediate_dict``; the record is mutated in place.
+        debug_mode: When true, the ``need_drop`` check is bypassed.
+
+    Returns:
+        The same record. On success ``content`` holds the list returned by
+        ``ocr_mk_mm_markdown_with_para_and_pagination`` and
+        ``pdf_intermediate_dict`` is cleared. If the record is flagged
+        ``need_drop`` (and ``debug_mode`` is off) it is returned with
+        ``dropped`` set to True. Any exception raised during conversion is
+        passed to ``exception_handler`` and its result is returned.
+    """
+    # Outside debug mode, honor the need_drop flag before doing any work.
+    if not debug_mode and jso.get("need_drop", False):
+        book_name = join_path(get_data_source(jso), jso["file_id"])
+        # `file=` is a print() keyword, not a logging one, so it is not passed.
+        logger.info(f"book_name is:{book_name} need drop")
+        jso["dropped"] = True
+        return jso
+    try:
+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
+        # Decompress pdf_intermediate_dict
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
+        jso["content"] = markdown_content
+        # markdown_content is a list of per-page entries, so the logged length
+        # is the number of pages that produced markdown.
+        logger.info(
+            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
+        )
+        # Clear data that is no longer needed
+        # jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        # jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
+
+
 def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
     jso: dict, debug_mode=False
 ) -> dict: