api.rst 7.2 KB


  1. API Usage
  2. ===========
  3. PDF
  4. ----
  5. Local File Example
  6. ^^^^^^^^^^^^^^^^^^
  7. .. code:: python
  import os

  from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.config.enums import SupportedPdfParseMethod

  # args
  pdf_file_name = "abc.pdf"  # replace with the real pdf path
  # Use only the file's base name minus its final extension, so that a path
  # such as "docs/report.v2.pdf" yields "report.v2" instead of "docs/report".
  name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

  # prepare env
  local_image_dir, local_md_dir = "output/images", "output"
  # base name of the image directory; passed to the dump_* calls below
  image_dir = str(os.path.basename(local_image_dir))
  # creating "output/images" also creates its parent "output"
  os.makedirs(local_image_dir, exist_ok=True)
  image_writer = FileBasedDataWriter(local_image_dir)
  md_writer = FileBasedDataWriter(local_md_dir)

  # read bytes
  reader1 = FileBasedDataReader("")
  pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

  # proc
  ## Create Dataset Instance
  ds = PymuDocDataset(pdf_bytes)

  ## inference
  if ds.classify() == SupportedPdfParseMethod.OCR:
      infer_result = ds.apply(doc_analyze, ocr=True)

      ## pipeline
      pipe_result = infer_result.pipe_ocr_mode(image_writer)
  else:
      infer_result = ds.apply(doc_analyze, ocr=False)

      ## pipeline
      pipe_result = infer_result.pipe_txt_mode(image_writer)

  ### draw model result on each page
  infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

  ### draw layout result on each page
  pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

  ### draw spans result on each page
  pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

  ### dump markdown
  pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

  ### dump content list
  pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
  49. S3 File Example
  50. ^^^^^^^^^^^^^^^^
  51. .. code:: python
  import os

  from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  # needed for the OCR / text classification check below
  from magic_pdf.config.enums import SupportedPdfParseMethod

  bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
  ak = "{Your S3 access key}"  # replace with real s3 access key
  sk = "{Your S3 secret key}"  # replace with real s3 secret key
  endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

  reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
  md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
  image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
  # image_writer's prefix relative to md_writer's prefix; passed to the dump_* calls below
  image_dir = "images"

  # args
  pdf_file_name = (
      "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
  )

  # prepare env
  # The draw_* calls below are given paths under this local directory,
  # so make sure it exists before they run.
  local_md_dir = "output"
  os.makedirs(local_md_dir, exist_ok=True)
  name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

  # read bytes
  pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

  # proc
  ## Create Dataset Instance
  ds = PymuDocDataset(pdf_bytes)

  ## inference
  if ds.classify() == SupportedPdfParseMethod.OCR:
      infer_result = ds.apply(doc_analyze, ocr=True)

      ## pipeline
      pipe_result = infer_result.pipe_ocr_mode(image_writer)
  else:
      infer_result = ds.apply(doc_analyze, ocr=False)

      ## pipeline
      pipe_result = infer_result.pipe_txt_mode(image_writer)

  ### draw model result on each page
  infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

  ### draw layout result on each page
  pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

  ### draw spans result on each page
  pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

  ### dump markdown
  pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

  ### dump content list
  pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
  94. MS-Office
  95. ----------
  96. .. code:: python
  import os

  from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.data.read_api import read_local_office

  # prepare env
  local_image_dir, local_md_dir = "output/images", "output"
  # base name of the image directory; passed to dump_md below
  image_dir = str(os.path.basename(local_image_dir))
  # creating "output/images" also creates its parent "output"
  os.makedirs(local_image_dir, exist_ok=True)
  image_writer = FileBasedDataWriter(local_image_dir)
  md_writer = FileBasedDataWriter(local_md_dir)

  # proc
  ## Create Dataset Instance
  input_file = "some_ppt.ppt"  # replace with real ms-office file
  # Use only the base name minus its final extension, so that a path such as
  # "slides/deck.v2.ppt" yields "deck.v2" instead of "slides/deck".
  input_file_name = os.path.splitext(os.path.basename(input_file))[0]
  ds = read_local_office(input_file)[0]  # first dataset returned for the file

  ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
      md_writer, f"{input_file_name}.md", image_dir
  )
  116. This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
  117. Image
  118. ---------
  119. Single Image File
  120. ^^^^^^^^^^^^^^^^^^^
  121. .. code:: python
  import os

  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.data.read_api import read_local_images

  # prepare env
  local_image_dir, local_md_dir = "output/images", "output"
  # base name of the image directory; passed to dump_md below
  image_dir = str(os.path.basename(local_image_dir))
  # creating "output/images" also creates its parent "output"
  os.makedirs(local_image_dir, exist_ok=True)
  image_writer = FileBasedDataWriter(local_image_dir)
  md_writer = FileBasedDataWriter(local_md_dir)

  # proc
  ## Create Dataset Instance
  input_file = "some_image.jpg"  # replace with real image file
  # Use only the base name minus its final extension, so that a path such as
  # "scans/page.001.jpg" yields "page.001" instead of "scans/page".
  input_file_name = os.path.splitext(os.path.basename(input_file))[0]
  ds = read_local_images(input_file)[0]  # first dataset returned for the file

  ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
      md_writer, f"{input_file_name}.md", image_dir
  )
  141. Directory That Contains Images
  142. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  143. .. code:: python
  import os

  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  from magic_pdf.data.read_api import read_local_images

  # prepare env
  local_image_dir, local_md_dir = "output/images", "output"
  # base name of the image directory; passed to dump_md below
  image_dir = str(os.path.basename(local_image_dir))
  # creating "output/images" also creates its parent "output"
  os.makedirs(local_image_dir, exist_ok=True)
  image_writer = FileBasedDataWriter(local_image_dir)
  md_writer = FileBasedDataWriter(local_md_dir)

  # proc
  ## Create Dataset Instance
  input_directory = "some_image_dir/"  # replace with real directory that contains images
  dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

  # One markdown file per dataset, named by its position: 0.md, 1.md, ...
  for count, ds in enumerate(dss):
      ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
          md_writer, f"{count}.md", image_dir
      )
  165. See :doc:`../data/data_reader_writer` for more reader and writer examples, and :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.