|
|
@@ -5,6 +5,7 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData
|
|
|
from magic_pdf.data.dataset import PymuDocDataset
|
|
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
|
+from magic_pdf.config.make_content_config import DropMode, MakeMode
|
|
|
|
|
|
# args
|
|
|
pdf_file_name = "demo1.pdf" # replace with the real pdf path
|
|
|
@@ -19,7 +20,6 @@ os.makedirs(local_image_dir, exist_ok=True)
|
|
|
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
|
|
local_md_dir
|
|
|
)
|
|
|
-image_dir = str(os.path.basename(local_image_dir))
|
|
|
|
|
|
# read bytes
|
|
|
reader1 = FileBasedDataReader("")
|
|
|
@@ -45,6 +45,9 @@ else:
|
|
|
### draw model result on each page
|
|
|
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
|
|
|
|
|
|
+### get model inference result
|
|
|
+model_inference_result = infer_result.get_infer_res()
|
|
|
+
|
|
|
### draw layout result on each page
|
|
|
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
|
|
|
|
|
|
@@ -55,4 +58,13 @@ pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf
|
|
|
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
|
|
|
|
|
|
### dump content list
|
|
|
-pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
|
|
|
+pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
|
|
|
+
|
|
|
+### get markdown content
|
|
|
+md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
|
|
|
+
|
|
|
+### get content list
|
|
|
+content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
|
|
|
+
|
|
|
+### get middle json
|
|
|
+middle_json_content = pipe_result.get_middle_json()
|