api.rst 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. API Usage
  2. ===========
  3. PDF
  4. ----
  5. Local File Example
  6. ^^^^^^^^^^^^^^^^^^
  7. .. code:: python
  8. import os
  9. from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  10. from magic_pdf.data.dataset import PymuDocDataset
  11. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  12. # args
  13. pdf_file_name = "abc.pdf" # replace with the real pdf path
  14. name_without_suff = pdf_file_name.split(".")[0]
  15. # prepare env
  16. local_image_dir, local_md_dir = "output/images", "output"
  17. image_dir = str(os.path.basename(local_image_dir))
  18. os.makedirs(local_image_dir, exist_ok=True)
  19. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
  20. local_md_dir
  21. )
  22. image_dir = str(os.path.basename(local_image_dir))
  23. # read bytes
  24. reader1 = FileBasedDataReader("")
  25. pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
  26. # proc
  27. ## Create Dataset Instance
  28. ds = PymuDocDataset(pdf_bytes)
  29. ## inference
  30. infer_result = ds.apply(doc_analyze, ocr=True)
  31. ### draw model result on each page
  32. infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
  33. ## pipeline
  34. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  35. ### draw layout result on each page
  36. pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
  37. ### draw spans result on each page
  38. pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
  39. ### dump markdown
  40. pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
  41. S3 File Example
  42. ^^^^^^^^^^^^^^^^
  43. .. code:: python
  44. import os
  45. from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
  46. from magic_pdf.data.dataset import PymuDocDataset
  47. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  48. bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
  49. ak = "{Your S3 access key}" # replace with real s3 access key
  50. sk = "{Your S3 secret key}" # replace with real s3 secret key
  51. endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
  52. reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
  53. writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
  54. image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
  55. # args
  56. pdf_file_name = (
  57. "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
  58. )
  59. # prepare env
  60. local_dir = "output"
  61. name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
  62. # read bytes
  63. pdf_bytes = reader.read(pdf_file_name) # read the pdf content
  64. # proc
  65. ## Create Dataset Instance
  66. ds = PymuDocDataset(pdf_bytes)
  67. ## inference
  68. infer_result = ds.apply(doc_analyze, ocr=True)
  69. ### draw model result on each page
  70. infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
  71. ## pipeline
  72. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  73. ### draw layout result on each page
  74. pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
  75. ### draw spans result on each page
  76. pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
  77. ### dump markdown
  78. pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
  79. MS-Office
  80. ----------
  81. .. code:: python
  82. import os
  83. from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  84. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  85. from magic_pdf.data.read_api import read_local_office
  86. # prepare env
  87. local_image_dir, local_md_dir = "output/images", "output"
  88. image_dir = str(os.path.basename(local_image_dir))
  89. os.makedirs(local_image_dir, exist_ok=True)
  90. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
  91. local_md_dir
  92. )
  93. # proc
  94. ## Create Dataset Instance
  95. input_file = "some_ppt.ppt" # replace with real ms-office file
  96. input_file_name = input_file.split(".")[0]
  97. ds = read_local_office(input_file)[0]
  98. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
  99. md_writer, f"{input_file_name}.md", image_dir
  100. )
  101. This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
  102. Image
  103. ---------
  104. Single Image File
  105. ^^^^^^^^^^^^^^^^^^^
  106. .. code:: python
  107. import os
  108. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  109. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  110. from magic_pdf.data.read_api import read_local_images
  111. # prepare env
  112. local_image_dir, local_md_dir = "output/images", "output"
  113. image_dir = str(os.path.basename(local_image_dir))
  114. os.makedirs(local_image_dir, exist_ok=True)
  115. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
  116. local_md_dir
  117. )
  118. # proc
  119. ## Create Dataset Instance
  120. input_file = "some_image.jpg" # replace with real image file
  121. input_file_name = input_file.split(".")[0]
  122. ds = read_local_images(input_file)[0]
  123. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
  124. md_writer, f"{input_file_name}.md", image_dir
  125. )
  126. Directory That Contains Images
  127. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  128. .. code:: python
  129. import os
  130. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  131. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  132. from magic_pdf.data.read_api import read_local_images
  133. # prepare env
  134. local_image_dir, local_md_dir = "output/images", "output"
  135. image_dir = str(os.path.basename(local_image_dir))
  136. os.makedirs(local_image_dir, exist_ok=True)
  137. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
  138. local_md_dir
  139. )
  140. # proc
  141. ## Create Dataset Instance
  142. input_directory = "some_image_dir/" # replace with real directory that contains images
  143. dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0]
  144. count = 0
  145. for ds in dss:
  146. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
  147. md_writer, f"{count}.md", image_dir
  148. )
  149. count += 1
  150. See :doc:`../data/data_reader_writer` for more reader and writer examples, and :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.