|
|
@@ -78,9 +78,9 @@ class AbsPipe(ABC):
|
|
|
pdf_meta["text_layout_per_page"],
|
|
|
)
|
|
|
if is_text_pdf:
|
|
|
- return "txt"
|
|
|
+ return AbsPipe.PIP_TXT
|
|
|
else:
|
|
|
- return "ocr"
|
|
|
+ return AbsPipe.PIP_OCR
|
|
|
|
|
|
@staticmethod
|
|
|
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
|
|
|
@@ -90,9 +90,9 @@ class AbsPipe(ABC):
|
|
|
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
|
|
parse_type = pdf_mid_data["_parse_type"]
|
|
|
pdf_info_list = pdf_mid_data["pdf_info"]
|
|
|
- if parse_type == "txt":
|
|
|
+ if parse_type == AbsPipe.PIP_TXT:
|
|
|
content_list = mk_universal_format(pdf_info_list, img_buket_path)
|
|
|
- elif parse_type == "ocr":
|
|
|
+ elif parse_type == AbsPipe.PIP_OCR:
|
|
|
content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
|
|
|
return content_list
|
|
|
|
|
|
@@ -104,10 +104,10 @@ class AbsPipe(ABC):
|
|
|
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
|
|
parse_type = pdf_mid_data["_parse_type"]
|
|
|
pdf_info_list = pdf_mid_data["pdf_info"]
|
|
|
- if parse_type == "txt":
|
|
|
+ if parse_type == AbsPipe.PIP_TXT:
|
|
|
content_list = mk_universal_format(pdf_info_list, img_buket_path)
|
|
|
md_content = mk_mm_markdown(content_list)
|
|
|
- elif parse_type == "ocr":
|
|
|
+ elif parse_type == AbsPipe.PIP_OCR:
|
|
|
md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
|
|
|
return md_content
|
|
|
|