|
@@ -7,18 +7,17 @@ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
|
|
|
|
|
|
# args
|
|
# args
|
|
|
-pdf_file_name = "demo1.pdf" # replace with the real pdf path
|
|
|
|
|
-name_without_suff = pdf_file_name.split(".")[0]
|
|
|
|
|
|
|
+__dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
|
+pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf") # replace with the real pdf path
|
|
|
|
|
+name_without_extension = os.path.basename(pdf_file_name).split('.')[0]
|
|
|
|
|
|
|
|
# prepare env
|
|
# prepare env
|
|
|
-local_image_dir, local_md_dir = "output/images", "output"
|
|
|
|
|
|
|
+local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images")
|
|
|
|
|
+local_md_dir = os.path.join(__dir__, "output", name_without_extension)
|
|
|
image_dir = str(os.path.basename(local_image_dir))
|
|
image_dir = str(os.path.basename(local_image_dir))
|
|
|
-
|
|
|
|
|
os.makedirs(local_image_dir, exist_ok=True)
|
|
os.makedirs(local_image_dir, exist_ok=True)
|
|
|
|
|
|
|
|
-image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
|
|
|
|
- local_md_dir
|
|
|
|
|
-)
|
|
|
|
|
|
|
+image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
|
|
|
|
|
|
|
# read bytes
|
|
# read bytes
|
|
|
reader1 = FileBasedDataReader("")
|
|
reader1 = FileBasedDataReader("")
|
|
@@ -41,32 +40,29 @@ else:
|
|
|
## pipeline
|
|
## pipeline
|
|
|
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
|
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
|
|
|
|
|
|
|
-### draw model result on each page
|
|
|
|
|
-infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
|
|
|
|
|
-
|
|
|
|
|
### get model inference result
|
|
### get model inference result
|
|
|
model_inference_result = infer_result.get_infer_res()
|
|
model_inference_result = infer_result.get_infer_res()
|
|
|
|
|
|
|
|
### draw layout result on each page
|
|
### draw layout result on each page
|
|
|
-pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
|
|
|
|
|
|
|
+pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf"))
|
|
|
|
|
|
|
|
### draw spans result on each page
|
|
### draw spans result on each page
|
|
|
-pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
|
|
|
|
|
|
|
+pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf"))
|
|
|
|
|
|
|
|
### get markdown content
|
|
### get markdown content
|
|
|
md_content = pipe_result.get_markdown(image_dir)
|
|
md_content = pipe_result.get_markdown(image_dir)
|
|
|
|
|
|
|
|
### dump markdown
|
|
### dump markdown
|
|
|
-pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
|
|
|
|
|
|
|
+pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir)
|
|
|
|
|
|
|
|
### get content list content
|
|
### get content list content
|
|
|
content_list_content = pipe_result.get_content_list(image_dir)
|
|
content_list_content = pipe_result.get_content_list(image_dir)
|
|
|
|
|
|
|
|
### dump content list
|
|
### dump content list
|
|
|
-pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
|
|
|
|
|
|
|
+pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir)
|
|
|
|
|
|
|
|
### get middle json
|
|
### get middle json
|
|
|
middle_json_content = pipe_result.get_middle_json()
|
|
middle_json_content = pipe_result.get_middle_json()
|
|
|
|
|
|
|
|
### dump middle json
|
|
### dump middle json
|
|
|
-pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
|
|
|
|
|
|
|
+pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json')
|