|
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
|
|
|
class UNIPipe(AbsPipe):
|
|
class UNIPipe(AbsPipe):
|
|
|
|
|
|
|
|
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
|
|
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
|
|
|
- start_page_id=0, end_page_id=None):
|
|
|
|
|
|
|
+ start_page_id=0, end_page_id=None, lang=None):
|
|
|
self.pdf_type = jso_useful_key["_pdf_type"]
|
|
self.pdf_type = jso_useful_key["_pdf_type"]
|
|
|
- super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
|
|
|
|
|
|
|
+ super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
|
|
|
if len(self.model_list) == 0:
|
|
if len(self.model_list) == 0:
|
|
|
self.input_model_is_empty = True
|
|
self.input_model_is_empty = True
|
|
|
else:
|
|
else:
|
|
@@ -28,16 +28,19 @@ class UNIPipe(AbsPipe):
|
|
|
def pipe_analyze(self):
|
|
def pipe_analyze(self):
|
|
|
if self.pdf_type == self.PIP_TXT:
|
|
if self.pdf_type == self.PIP_TXT:
|
|
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
|
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
|
|
|
- start_page_id=self.start_page_id, end_page_id=self.end_page_id)
|
|
|
|
|
|
|
+ start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
|
|
|
|
+ lang=self.lang)
|
|
|
elif self.pdf_type == self.PIP_OCR:
|
|
elif self.pdf_type == self.PIP_OCR:
|
|
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
|
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
|
|
|
- start_page_id=self.start_page_id, end_page_id=self.end_page_id)
|
|
|
|
|
|
|
+ start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
|
|
|
|
+ lang=self.lang)
|
|
|
|
|
|
|
|
def pipe_parse(self):
|
|
def pipe_parse(self):
|
|
|
if self.pdf_type == self.PIP_TXT:
|
|
if self.pdf_type == self.PIP_TXT:
|
|
|
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
|
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
|
|
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
|
|
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
|
|
|
- start_page_id=self.start_page_id, end_page_id=self.end_page_id)
|
|
|
|
|
|
|
+ start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
|
|
|
|
+ lang=self.lang)
|
|
|
elif self.pdf_type == self.PIP_OCR:
|
|
elif self.pdf_type == self.PIP_OCR:
|
|
|
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
|
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
|
|
is_debug=self.is_debug,
|
|
is_debug=self.is_debug,
|