Переглянути джерело

中间态dict结构调整
部分函数重构

赵小蒙 1 рік тому
батько
коміт
709a65008a

+ 1 - 1
demo/text_demo.py

@@ -15,7 +15,7 @@ from loguru import logger
 
 from magic_pdf.libs.config_reader import get_s3_config_dict
 from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
-from magic_pdf.spark.base import get_data_source
+from magic_pdf.spark.spark_api import get_data_source
 
 
 def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):

+ 2 - 2
magic_pdf/dict2md/mkcontent.py

@@ -228,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
         logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
          
 
-def mk_universal_format(para_dict: dict, img_buket_path):
+def mk_universal_format(pdf_info_list: list, img_buket_path):
     """
     构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
     """
     content_lst = []
-    for _, page_info in para_dict.items():
+    for page_info in pdf_info_list:
         page_lst = [] # 一个page内的段落列表
         para_blocks = page_info.get("para_blocks")
         pymu_raw_blocks = page_info.get("preproc_blocks")

+ 5 - 5
magic_pdf/dict2md/ocr_mkcontent.py

@@ -69,11 +69,11 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
     return '\n'.join(markdown)
 
 
-def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
+def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
     markdown = []
-    for _, page_info in pdf_info_dict.items():
+    for page_info in pdf_info_list:
         paras_of_layout = page_info.get("para_blocks")
-        page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
+        page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm", img_buket_path)
         markdown.extend(page_markdown)
     return '\n\n'.join(markdown)
 
@@ -100,7 +100,7 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
     return markdown_with_para_and_pagination
 
 
-def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
+def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path):
     page_markdown = []
     for paras in paras_of_layout:
         for para in paras:
@@ -123,7 +123,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
                         content = f"\n$$\n{span['content']}\n$$\n"
                     elif span_type in [ContentType.Image, ContentType.Table]:
                         if mode == 'mm':
-                            content = f"\n![]({span['image_path']})\n"
+                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                         elif mode == 'nlp':
                             pass
                     if content != '':

+ 5 - 0
magic_pdf/libs/convert_utils.py

@@ -0,0 +1,5 @@
+def dict_to_list(input_dict):
+    """
+    Return the values of input_dict as a new list, in the dict's iteration order (keys are dropped).
+    """
+    return [item for _, item in input_dict.items()]

+ 1 - 1
magic_pdf/libs/pdf_image_tools.py

@@ -28,7 +28,7 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
 
     byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
 
-    imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
+    imageWriter.write(content=byte_data, path=img_hash256_path, mode="binary")
 
     return img_hash256_path
 

+ 8 - 1
magic_pdf/pdf_parse_by_ocr.py

@@ -5,6 +5,7 @@ from magic_pdf.libs.commons import (
     get_delta_time,
     get_docx_model_output,
 )
+from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.hash_utils import compute_md5
@@ -210,4 +211,10 @@ def parse_pdf_by_ocr(
     """分段"""
     para_split(pdf_info_dict, debug_mode=debug_mode)
 
-    return pdf_info_dict
+    """dict转list"""
+    pdf_info_list = dict_to_list(pdf_info_dict)
+    new_pdf_info_dict = {
+        "pdf_info": pdf_info_list,
+    }
+
+    return new_pdf_info_dict

+ 9 - 1
magic_pdf/pdf_parse_by_txt.py

@@ -11,6 +11,7 @@ from magic_pdf.layout.bbox_sort import (
     prepare_bboxes_for_layout_split,
 )
 from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
+from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.markdown_utils import escape_special_markdown_char
@@ -400,4 +401,11 @@ def parse_pdf_by_txt(
     if error_info is not None:
         return _deal_with_text_exception(error_info)
 
-    return pdf_info_dict
+
+    """dict转list"""
+    pdf_info_list = dict_to_list(pdf_info_dict)
+    new_pdf_info_dict = {
+        "pdf_info": pdf_info_list,
+    }
+
+    return new_pdf_info_dict

+ 57 - 12
magic_pdf/spark/UNIPipe.py → magic_pdf/pipe/UNIPipe.py

@@ -1,20 +1,26 @@
+import json
+
 from loguru import logger
 
-from magic_pdf.dict2md.mkcontent import mk_universal_format
-from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para
+from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
+from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
+from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
+from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.detect_language_from_model import get_language_from_model
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.spark.spark_api import parse_union_pdf, parse_ocr_pdf
+from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 
 class UNIPipe:
     def __init__(self):
         pass
 
-    def classify(self, pdf_bytes: bytes) -> str:
+    @staticmethod
+    def classify(pdf_bytes: bytes) -> str:
         """
         根据pdf的元数据,判断是否是文本pdf,还是ocr pdf
         """
@@ -57,25 +63,64 @@ class UNIPipe:
                 pdf_mid_data = parse_ocr_pdf(pdf_bytes, jso_useful_key['model_list'], image_writer)
             else:
                 raise Exception(f"pdf type is not txt or ocr")
-            return JsonCompressor.compress(pdf_mid_data)
+            return JsonCompressor.compress_json(pdf_mid_data)
 
-    def mk_uni_format(self, pdf_mid_data: str, img_buket_path: str) -> list:
+    @staticmethod
+    def mk_uni_format(pdf_mid_data: str, img_buket_path: str) -> list:
         """
         根据pdf类型,生成统一格式content_list
         """
         pdf_mid_data = JsonCompressor.decompress_json(pdf_mid_data)
         parse_type = pdf_mid_data["_parse_type"]
+        pdf_info_list = pdf_mid_data["pdf_info"]
         if parse_type == "txt":
-            content_list = mk_universal_format(pdf_mid_data, img_buket_path)
+            content_list = mk_universal_format(pdf_info_list, img_buket_path)
         elif parse_type == "ocr":
-            content_list = make_standard_format_with_para(pdf_mid_data, img_buket_path)
+            content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
         return content_list
 
+    @staticmethod
+    def mk_markdown(pdf_mid_data: str, img_buket_path: str) -> str:
+        """
+        Decompress pdf_mid_data and build its markdown, dispatching on the stored "_parse_type" ("txt" or "ocr").
+        """
+        pdf_mid_data = JsonCompressor.decompress_json(pdf_mid_data)
+        parse_type = pdf_mid_data["_parse_type"]
+        pdf_info_list = pdf_mid_data["pdf_info"]
+        if parse_type == "txt":
+            return mk_mm_markdown(mk_universal_format(pdf_info_list, img_buket_path))
+        if parse_type == "ocr":
+            return ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
+        # Fail with a clear message instead of an UnboundLocalError on a never-assigned md_content.
+        raise Exception(f"unsupported _parse_type: {parse_type}")
+
 
 if __name__ == '__main__':
     # 测试
-    pipe = UNIPipe()
-    pdf_bytes = open(r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\数学新星网\edu_00001544.pdf",
-                     "rb").read()
-    pdf_type = pipe.classify(pdf_bytes)
+    # file_path = r"tmp/unittest/download-pdfs/数学新星网/edu_00001236.pdf"
+    drw = DiskReaderWriter(r"D:/project/20231108code-clean")
+    # pdf_bytes = drw.read(path=file_path, mode=AbsReaderWriter.MODE_BIN)
+    # pdf_type = UNIPipe.classify(pdf_bytes)
+    # logger.info(f"pdf_type is {pdf_type}")
+
+    pdf_file_path = r"linshixuqiu\25536-00.pdf"
+    model_file_path = r"linshixuqiu\25536-00.json"
+    pdf_bytes = drw.read(path=pdf_file_path, mode=AbsReaderWriter.MODE_BIN)
+    model_json_txt = drw.read(path=model_file_path, mode=AbsReaderWriter.MODE_TXT)
+
+    pdf_type = UNIPipe.classify(pdf_bytes)
     logger.info(f"pdf_type is {pdf_type}")
+    jso_useful_key = {
+        "_pdf_type": pdf_type,
+        "model_list": json.loads(model_json_txt),
+    }
+    pipe = UNIPipe()
+    write_path = r"D:\project\20231108code-clean\linshixuqiu\25536-00"
+    img_buket_path = "imgs"
+    img_writer = DiskReaderWriter(join_path(write_path, img_buket_path))
+    pdf_mid_data = pipe.parse(pdf_bytes, img_writer, jso_useful_key)
+
+    md_content = pipe.mk_markdown(pdf_mid_data, "imgs")
+    md_writer = DiskReaderWriter(write_path)
+    md_writer.write(content=md_content, path="25536-00.md", mode=AbsReaderWriter.MODE_TXT)
+    md_writer.write(content=json.dumps(JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4), path="25536-00.json", mode=AbsReaderWriter.MODE_TXT)

+ 0 - 0
magic_pdf/pipe/__init__.py


+ 6 - 0
magic_pdf/spark/base.bak

@@ -0,0 +1,6 @@
+from loguru import logger
+
+from magic_pdf.libs.drop_reason import DropReason
+
+
+

+ 0 - 39
magic_pdf/spark/base.py

@@ -1,39 +0,0 @@
-from loguru import logger
-
-from magic_pdf.libs.drop_reason import DropReason
-
-
-def get_data_source(jso: dict):
-    data_source = jso.get("data_source")
-    if data_source is None:
-        data_source = jso.get("file_source")
-    return data_source
-
-
-def get_data_type(jso: dict):
-    data_type = jso.get("data_type")
-    if data_type is None:
-        data_type = jso.get("file_type")
-    return data_type
-
-
-def get_bookid(jso: dict):
-    book_id = jso.get("bookid")
-    if book_id is None:
-        book_id = jso.get("original_file_id")
-    return book_id
-
-
-def exception_handler(jso: dict, e):
-    logger.exception(e)
-    jso["_need_drop"] = True
-    jso["_drop_reason"] = DropReason.Exception
-    jso["_exception"] = f"ERROR: {e}"
-    return jso
-
-
-def get_bookname(jso: dict):
-    data_source = get_data_source(jso)
-    file_id = jso.get("file_id")
-    book_name = f"{data_source}/{file_id}"
-    return book_name

+ 27 - 73
magic_pdf/spark/spark_api.py

@@ -1,88 +1,42 @@
-
-"""
-用户输入:
-    model数组,每个元素代表一个页面
-    pdf在s3的路径
-    截图保存的s3位置
-
-然后:
-    1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
-    2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
-
-其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
-
-"""
 from loguru import logger
 
-from magic_pdf.io import AbsReaderWriter
-from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
-from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
+from magic_pdf.libs.drop_reason import DropReason
 
 
-def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
-    """
-    解析文本类pdf
-    """
-    pdf_info_dict = parse_pdf_by_txt(
-        pdf_bytes,
-        pdf_models,
-        imageWriter,
-        start_page_id=start_page,
-        debug_mode=is_debug,
-    )
+def get_data_source(jso: dict):
+    data_source = jso.get("data_source")
+    if data_source is None:
+        data_source = jso.get("file_source")
+    return data_source
 
-    pdf_info_dict["parse_type"] = "txt"
 
-    return pdf_info_dict
+def get_data_type(jso: dict):
+    data_type = jso.get("data_type")
+    if data_type is None:
+        data_type = jso.get("file_type")
+    return data_type
 
 
-def parse_ocr_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
-    """
-    解析ocr类pdf
-    """
-    pdf_info_dict = parse_pdf_by_ocr(
-        pdf_bytes,
-        pdf_models,
-        imageWriter,
-        start_page_id=start_page,
-        debug_mode=is_debug,
-    )
-
-    pdf_info_dict["_parse_type"] = "ocr"
-
-    return pdf_info_dict
+def get_bookid(jso: dict):
+    book_id = jso.get("bookid")
+    if book_id is None:
+        book_id = jso.get("original_file_id")
+    return book_id
 
 
-def parse_union_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,  *args, **kwargs):
-    """
-    ocr和文本混合的pdf,全部解析出来
-    """
-    def parse_pdf(method):
-        try:
-            return method(
-                pdf_bytes,
-                pdf_models,
-                imageWriter,
-                start_page_id=start_page,
-                debug_mode=is_debug,
-            )
-        except Exception as e:
-            logger.error(f"{method.__name__} error: {e}")
-            return None
-
-    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
+def exception_handler(jso: dict, e):
+    logger.exception(e)
+    jso["_need_drop"] = True
+    jso["_drop_reason"] = DropReason.Exception
+    jso["_exception"] = f"ERROR: {e}"
+    return jso
 
-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
-        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
-        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
-        if pdf_info_dict is None:
-            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
-        else:
-            pdf_info_dict["_parse_type"] = "ocr"
-    else:
-        pdf_info_dict["_parse_type"] = "txt"
 
-    return pdf_info_dict
+def get_bookname(jso: dict):
+    data_source = get_data_source(jso)
+    file_id = jso.get("file_id")
+    book_name = f"{data_source}/{file_id}"
+    return book_name
 
 
 def spark_json_extractor(jso: dict) -> dict:

+ 89 - 0
magic_pdf/user_api.py

@@ -0,0 +1,89 @@
+
+"""
+用户输入:
+    model数组,每个元素代表一个页面
+    pdf在s3的路径
+    截图保存的s3位置
+
+然后:
+    1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
+    2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
+
+其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
+
+"""
+from loguru import logger
+
+from magic_pdf.io import AbsReaderWriter
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
+from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
+
+
+def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
+                  **kwargs):
+    """
+    Parse a text-type PDF with parse_pdf_by_txt and store "txt" under the result's "_parse_type" key.
+    """
+    pdf_info_dict = parse_pdf_by_txt(
+        pdf_bytes,
+        pdf_models,
+        imageWriter,
+        start_page_id=start_page,
+        debug_mode=is_debug,
+    )
+    # Key must be "_parse_type" (leading underscore): parse_ocr_pdf / parse_union_pdf write that same key.
+    pdf_info_dict["_parse_type"] = "txt"
+
+    return pdf_info_dict
+
+
+def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
+                  **kwargs):
+    """
+    Parse an OCR-type PDF.
+
+    Calls parse_pdf_by_ocr with the PDF bytes, the model output and imageWriter, then stores
+    "ocr" under the "_parse_type" key of the returned dict before handing it back.
+    Extra *args / **kwargs are accepted but not forwarded to the parser.
+    """
+    # Keyword options understood by parse_pdf_by_ocr.
+    parser_options = {"start_page_id": start_page, "debug_mode": is_debug}
+    ocr_result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)
+
+    ocr_result["_parse_type"] = "ocr"
+
+    return ocr_result
+
+
+def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
+                    *args, **kwargs):
+    """
+    Parse a PDF with the text parser first, falling back to the OCR parser if that errors or is dropped.
+    """
+
+    def _attempt(method):
+        # Run one parser; any exception is logged and reported to the caller as None.
+        try:
+            result = method(
+                pdf_bytes, pdf_models, imageWriter,
+                start_page_id=start_page,
+                debug_mode=is_debug,
+            )
+        except Exception as e:
+            logger.error(f"{method.__name__} error: {e}")
+            return None
+        return result
+
+    txt_result = _attempt(parse_pdf_by_txt)
+    if txt_result is not None and not txt_result.get("_need_drop", False):
+        txt_result["_parse_type"] = "txt"
+        return txt_result
+
+    # The text parser raised or flagged the document with "_need_drop": retry with OCR.
+    logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+    ocr_result = _attempt(parse_pdf_by_ocr)
+    if ocr_result is None:
+        raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
+
+    ocr_result["_parse_type"] = "ocr"
+    return ocr_result

+ 0 - 0
utils/config_init_to_json.py → tools/config_init_to_json.py