|
|
@@ -10,11 +10,12 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
|
|
|
|
|
|
class UNIPipe(AbsPipe):
|
|
|
|
|
|
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
|
|
|
- super().__init__(pdf_bytes, model_list, image_writer, is_debug)
|
|
|
+ def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
|
|
|
+ self.pdf_type = jso_useful_key["_pdf_type"]
|
|
|
+ super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
|
|
|
|
|
|
def pipe_classify(self):
|
|
|
- self.pdf_type = UNIPipe.classify(self.pdf_bytes)
|
|
|
+ self.pdf_type = AbsPipe.classify(self.pdf_bytes)
|
|
|
|
|
|
def pipe_parse(self):
|
|
|
if self.pdf_type == self.PIP_TXT:
|
|
|
@@ -46,14 +47,21 @@ if __name__ == '__main__':
|
|
|
img_bucket_path = "imgs"
|
|
|
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
|
|
|
|
|
|
- pipe = UNIPipe(pdf_bytes, model_list, img_writer, img_bucket_path)
|
|
|
+ # pdf_type = UNIPipe.classify(pdf_bytes)
|
|
|
+ # jso_useful_key = {
|
|
|
+ # "_pdf_type": pdf_type,
|
|
|
+ # "model_list": model_list
|
|
|
+ # }
|
|
|
+
|
|
|
+ jso_useful_key = {
|
|
|
+ "_pdf_type": "",
|
|
|
+ "model_list": model_list
|
|
|
+ }
|
|
|
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
|
|
|
pipe.pipe_classify()
|
|
|
pipe.pipe_parse()
|
|
|
- md_content = pipe.pipe_mk_markdown()
|
|
|
- try:
|
|
|
- content_list = pipe.pipe_mk_uni_format()
|
|
|
- except Exception as e:
|
|
|
- logger.exception(e)
|
|
|
+ md_content = pipe.pipe_mk_markdown(img_bucket_path)
|
|
|
+ content_list = pipe.pipe_mk_uni_format(img_bucket_path)
|
|
|
|
|
|
md_writer = DiskReaderWriter(write_path)
|
|
|
md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
|