|
|
@@ -42,6 +42,17 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
default='pipeline',
|
|
|
)
|
|
|
@click.option(
|
|
|
+ '-l',
|
|
|
+ '--lang',
|
|
|
+ 'lang',
|
|
|
+ type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']),
|
|
|
+ help="""
|
|
|
+ Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
|
|
|
+ Without languages specified, 'ch' will be used by default.
|
|
|
+ """,
|
|
|
+ default='ch',
|
|
|
+)
|
|
|
+@click.option(
|
|
|
'-u',
|
|
|
'--url',
|
|
|
'server_url',
|
|
|
@@ -68,24 +79,33 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
|
default=None,
|
|
|
)
|
|
|
|
|
|
def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_page_id):
    # Parse a single document, or every supported document directly inside a
    # directory, and write the results under output_dir.
    #
    # NOTE: documented with comments rather than a docstring on purpose -- this
    # function carries click decorators, and click would presumably surface a
    # docstring as the command's --help text, changing user-visible output.
    #
    # input_path     a file, or a directory whose direct children are filtered
    #                by suffix against pdf_suffixes + image_suffixes
    # output_dir     created if it does not exist yet
    # lang           OCR language hint; the same value is applied to every document
    # backend, server_url, start_page_id, end_page_id
    #                forwarded to do_parse unchanged
    os.makedirs(output_dir, exist_ok=True)

    def parse_doc(path_list: list[Path]):
        """Read every path and hand the readable ones to ``do_parse`` as one batch.

        Failures are logged and never propagate to the caller: a file that
        cannot be read is skipped, and an error raised by ``do_parse`` itself
        is logged for the whole batch.
        """
        file_name_list = []
        pdf_bytes_list = []
        for path in path_list:
            # Read each file in isolation so that one unreadable document does
            # not prevent the remaining documents from being parsed.
            try:
                pdf_bytes = read_fn(path)
            except Exception as e:
                logger.exception(e)
                continue
            file_name_list.append(str(Path(path).stem))
            pdf_bytes_list.append(pdf_bytes)

        if not file_name_list:
            # Nothing matched or nothing could be read; do not call do_parse
            # with empty batches.
            logger.warning(f'No parsable documents found in: {input_path}')
            return

        # do_parse takes one language entry per document.
        lang_list = [lang] * len(file_name_list)
        try:
            do_parse(output_dir, file_name_list, pdf_bytes_list, lang_list, backend, server_url,
                     start_page_id=start_page_id, end_page_id=end_page_id)
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(input_path):
        supported_suffixes = pdf_suffixes + image_suffixes
        # Sorted so the batch order does not depend on filesystem listing order.
        doc_path_list = [
            doc_path
            for doc_path in sorted(Path(input_path).glob('*'))
            if doc_path.suffix in supported_suffixes
        ]
        parse_doc(doc_path_list)
    else:
        parse_doc([Path(input_path)])
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point. main() is called without arguments here; the click
    # decorators on it are expected to fill its parameters from the command line.
    main()
|