|
|
@@ -164,21 +164,21 @@ async def pdf_parse(
|
|
|
Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
|
|
|
to the specified directory.
|
|
|
|
|
|
- :param pdf_file: The PDF file to be parsed. Must not be specified together with
|
|
|
- `pdf_path`
|
|
|
- :param pdf_path: The path to the PDF file to be parsed. Must not be specified
|
|
|
- together with `pdf_file`
|
|
|
- :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
|
|
|
- results are not satisfactory, try ocr
|
|
|
- :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default
|
|
|
- to False. Different stages of data will be written to different .json files (3
|
|
|
- in total), md content will be saved to .md file
|
|
|
- :param output_dir: Output directory for results. A folder named after the PDF file
|
|
|
- will be created to store all results
|
|
|
- :param return_layout: Whether to return parsed PDF layout. Default to False
|
|
|
- :param return_info: Whether to return parsed PDF info. Default to False
|
|
|
- :param return_content_list: Whether to return parsed PDF content list. Default to
|
|
|
- False
|
|
|
+ Args:
|
|
|
+ pdf_file: The PDF file to be parsed. Must not be specified together with
|
|
|
+ `pdf_path`
|
|
|
+ pdf_path: The path to the PDF file to be parsed. Must not be specified together
|
|
|
+ with `pdf_file`
|
|
|
+ parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
|
|
|
+ results are not satisfactory, try ocr
|
|
|
+ is_json_md_dump: Whether to write parsed data to .json and .md files. Defaults
|
|
|
+ to False. Different stages of data will be written to different .json files
|
|
|
+ (3 in total), md content will be saved to .md file
|
|
|
+ output_dir: Output directory for results. A folder named after the PDF file
|
|
|
+ will be created to store all results
|
|
|
+ return_layout: Whether to return parsed PDF layout. Defaults to False
|
|
|
+ return_info: Whether to return parsed PDF info. Defaults to False
|
|
|
+ return_content_list: Whether to return parsed PDF content list. Defaults to False
|
|
|
"""
|
|
|
try:
|
|
|
if (pdf_file is None and pdf_path is None) or (
|