1 жил өмнө · e9aa103cae
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -94,6 +94,36 @@ def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
 
				     return '\n\n'.join(markdown)
			
 
				 
			
 
				 
			
 
				+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
			
 
				+    markdown_with_para_and_pagination = []
			
 
				+    for page_no, page_info in pdf_info_dict.items():
			
 
				+        page_markdown = []
			
 
				+        paras = page_info.get("para_blocks")
			
 
				+        if not paras:
			
 
				+            continue
			
 
				+        for para in paras:
			
 
				+            para_text = ''
			
 
				+            for line in para:
			
 
				+                for span in line['spans']:
			
 
				+                    span_type = span.get('type')
			
 
				+                    if span_type == ContentType.Text:
			
 
				+                        content = split_long_words(span['content'])
			
 
				+                        # content = span['content']
			
 
				+                    elif span_type == ContentType.InlineEquation:
			
 
				+                        content = f"${span['content']}$"
			
 
				+                    elif span_type == ContentType.InterlineEquation:
			
 
				+                        content = f"\n$$\n{span['content']}\n$$\n"
			
 
				+                    elif span_type in [ContentType.Image, ContentType.Table]:
			
 
				+                        content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
			
 
				+                    para_text += content + ' '
			
 
				+            page_markdown.append(para_text.strip() + '  ')
			
 
				+        markdown_with_para_and_pagination.append({
			
 
				+            'page_no': page_no,
			
 
				+            'md': '\n\n'.join(page_markdown)
			
 
				+        })
			
 
				+    return markdown_with_para_and_pagination
			
 
				+
			
 
				+
			
 
				 def make_standard_format_with_para(pdf_info_dict: dict):
			
 
				     content_list = []
			
 
				     for _, page_info in pdf_info_dict.items():
			
--- a/magic_pdf/pipeline.py
+++ b/magic_pdf/pipeline.py
@@ -7,7 +7,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
 
				     ocr_mk_nlp_markdown,
			
 
				     ocr_mk_mm_markdown,
			
 
				     ocr_mk_mm_standard_format,
			
 
				-    ocr_mk_mm_markdown_with_para,
			
 
				+    ocr_mk_mm_markdown_with_para, ocr_mk_mm_markdown_with_para_and_pagination,
			
 
				 )
			
 
				 from magic_pdf.libs.commons import (
			
 
				     read_file,
			
@@ -525,6 +525,35 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
 
				     return jso
			
 
				 
			
 
				 
			
 
				+def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
			
 
				+
			
 
				+    if debug_mode:
			
 
				+        pass
			
 
				+    else:  # 如果debug没开，则检测是否有needdrop字段
			
 
				+        if jso.get("need_drop", False):
			
 
				+            book_name = join_path(get_data_source(jso), jso["file_id"])
			
 
				+            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
			
 
				+            jso["dropped"] = True
			
 
				+            return jso
			
 
				+    try:
			
 
				+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
			
 
				+        # 将 pdf_intermediate_dict 解压
			
 
				+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
			
 
				+        markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
			
 
				+        jso["content"] = markdown_content
			
 
				+        logger.info(
			
 
				+            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
			
 
				+            file=sys.stderr,
			
 
				+        )
			
 
				+        # 把无用的信息清空
			
 
				+        # jso["doc_layout_result"] = ""
			
 
				+        jso["pdf_intermediate_dict"] = ""
			
 
				+        # jso["pdf_meta"] = ""
			
 
				+    except Exception as e:
			
 
				+        jso = exception_handler(jso, e)
			
 
				+    return jso
			
 
				+
			
 
				+
			
 
				 def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
			
 
				     jso: dict, debug_mode=False
			
 
				 ) -> dict: