test_cli_sdk.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. """test cli and sdk."""
  2. import logging
  3. import os
  4. import pytest
  5. from conf import conf
  6. from lib import common
  7. import magic_pdf.model as model_config
  8. from magic_pdf.pipe.UNIPipe import UNIPipe
  9. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  10. model_config.__use_inside_model__ = True
  11. pdf_res_path = conf.conf['pdf_res_path']
  12. code_path = conf.conf['code_path']
  13. pdf_dev_path = conf.conf['pdf_dev_path']
  14. class TestCli:
  15. """test cli."""
  16. @pytest.mark.P0
  17. def test_pdf_auto_sdk(self):
  18. """pdf sdk auto test."""
  19. demo_names = list()
  20. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  21. for pdf_file in os.listdir(pdf_path):
  22. if pdf_file.endswith('.pdf'):
  23. demo_names.append(pdf_file.split('.')[0])
  24. for demo_name in demo_names:
  25. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  26. print(pdf_path)
  27. pdf_bytes = open(pdf_path, 'rb').read()
  28. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  29. image_dir = str(os.path.basename(local_image_dir))
  30. image_writer = DiskReaderWriter(local_image_dir)
  31. model_json = list()
  32. jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
  33. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  34. pipe.pipe_classify()
  35. if len(model_json) == 0:
  36. if model_config.__use_inside_model__:
  37. pipe.pipe_analyze()
  38. else:
  39. exit(1)
  40. pipe.pipe_parse()
  41. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  42. dir_path = os.path.join(pdf_dev_path, 'mineru')
  43. if not os.path.exists(dir_path):
  44. os.makedirs(dir_path, exist_ok=True)
  45. res_path = os.path.join(dir_path, f'{demo_name}.md')
  46. common.delete_file(res_path)
  47. with open(res_path, 'w+', encoding='utf-8') as f:
  48. f.write(md_content)
  49. common.sdk_count_folders_and_check_contents(res_path)
  50. @pytest.mark.P0
  51. def test_pdf_ocr_sdk(self):
  52. """pdf sdk ocr test."""
  53. demo_names = list()
  54. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  55. for pdf_file in os.listdir(pdf_path):
  56. if pdf_file.endswith('.pdf'):
  57. demo_names.append(pdf_file.split('.')[0])
  58. for demo_name in demo_names:
  59. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  60. print(pdf_path)
  61. pdf_bytes = open(pdf_path, 'rb').read()
  62. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  63. image_dir = str(os.path.basename(local_image_dir))
  64. image_writer = DiskReaderWriter(local_image_dir)
  65. model_json = list()
  66. jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
  67. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  68. pipe.pipe_classify()
  69. if len(model_json) == 0:
  70. if model_config.__use_inside_model__:
  71. pipe.pipe_analyze()
  72. else:
  73. exit(1)
  74. pipe.pipe_parse()
  75. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  76. dir_path = os.path.join(pdf_dev_path, 'mineru')
  77. if not os.path.exists(dir_path):
  78. os.makedirs(dir_path, exist_ok=True)
  79. res_path = os.path.join(dir_path, f'{demo_name}.md')
  80. common.delete_file(res_path)
  81. with open(res_path, 'w+', encoding='utf-8') as f:
  82. f.write(md_content)
  83. common.sdk_count_folders_and_check_contents(res_path)
  84. @pytest.mark.P0
  85. def test_pdf_txt_sdk(self):
  86. """pdf sdk txt test."""
  87. demo_names = list()
  88. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  89. for pdf_file in os.listdir(pdf_path):
  90. if pdf_file.endswith('.pdf'):
  91. demo_names.append(pdf_file.split('.')[0])
  92. for demo_name in demo_names:
  93. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  94. print(pdf_path)
  95. pdf_bytes = open(pdf_path, 'rb').read()
  96. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  97. image_dir = str(os.path.basename(local_image_dir))
  98. image_writer = DiskReaderWriter(local_image_dir)
  99. model_json = list()
  100. jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
  101. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  102. pipe.pipe_classify()
  103. if len(model_json) == 0:
  104. if model_config.__use_inside_model__:
  105. pipe.pipe_analyze()
  106. else:
  107. exit(1)
  108. pipe.pipe_parse()
  109. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  110. dir_path = os.path.join(pdf_dev_path, 'mineru')
  111. if not os.path.exists(dir_path):
  112. os.makedirs(dir_path, exist_ok=True)
  113. res_path = os.path.join(dir_path, f'{demo_name}.md')
  114. common.delete_file(res_path)
  115. with open(res_path, 'w+', encoding='utf-8') as f:
  116. f.write(md_content)
  117. common.sdk_count_folders_and_check_contents(res_path)
  118. @pytest.mark.P0
  119. def test_pdf_cli_auto(self):
  120. """magic_pdf cli test auto."""
  121. demo_names = []
  122. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  123. for pdf_file in os.listdir(pdf_path):
  124. if pdf_file.endswith('.pdf'):
  125. demo_names.append(pdf_file.split('.')[0])
  126. for demo_name in demo_names:
  127. res_path = os.path.join(pdf_dev_path, 'mineru')
  128. common.delete_file(res_path)
  129. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  130. pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
  131. logging.info(cmd)
  132. os.system(cmd)
  133. common.cli_count_folders_and_check_contents(
  134. os.path.join(res_path, demo_name, 'auto'))
  135. @pytest.mark.P0
  136. def test_pdf_clit_txt(self):
  137. """magic_pdf cli test txt."""
  138. demo_names = []
  139. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  140. for pdf_file in os.listdir(pdf_path):
  141. if pdf_file.endswith('.pdf'):
  142. demo_names.append(pdf_file.split('.')[0])
  143. for demo_name in demo_names:
  144. res_path = os.path.join(pdf_dev_path, 'mineru')
  145. common.delete_file(res_path)
  146. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  147. pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
  148. logging.info(cmd)
  149. os.system(cmd)
  150. common.cli_count_folders_and_check_contents(
  151. os.path.join(res_path, demo_name, 'txt'))
  152. @pytest.mark.P0
  153. def test_pdf_clit_ocr(self):
  154. """magic_pdf cli test ocr."""
  155. demo_names = []
  156. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  157. for pdf_file in os.listdir(pdf_path):
  158. if pdf_file.endswith('.pdf'):
  159. demo_names.append(pdf_file.split('.')[0])
  160. for demo_name in demo_names:
  161. res_path = os.path.join(pdf_dev_path, 'mineru')
  162. common.delete_file(res_path)
  163. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  164. pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
  165. logging.info(cmd)
  166. os.system(cmd)
  167. common.cli_count_folders_and_check_contents(
  168. os.path.join(res_path, demo_name, 'ocr'))
  169. @pytest.mark.P1
  170. def test_pdf_dev_cli_local_jsonl_txt(self):
  171. """magic_pdf_dev cli local txt."""
  172. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  173. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  174. logging.info(cmd)
  175. os.system(cmd)
  176. @pytest.mark.P1
  177. def test_pdf_dev_cli_local_jsonl_ocr(self):
  178. """magic_pdf_dev cli local ocr."""
  179. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  180. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  181. logging.info(cmd)
  182. os.system(cmd)
  183. @pytest.mark.P1
  184. def test_pdf_dev_cli_local_jsonl_auto(self):
  185. """magic_pdf_dev cli local auto."""
  186. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  187. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  188. logging.info(cmd)
  189. os.system(cmd)
  190. @pytest.mark.P1
  191. def test_pdf_dev_cli_s3_jsonl_txt(self):
  192. """magic_pdf_dev cli s3 txt."""
  193. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  194. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  195. logging.info(cmd)
  196. os.system(cmd)
  197. @pytest.mark.P1
  198. def test_pdf_dev_cli_s3_jsonl_ocr(self):
  199. """magic_pdf_dev cli s3 ocr."""
  200. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  201. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  202. logging.info(cmd)
  203. os.system(cmd)
  204. @pytest.mark.P1
  205. def test_pdf_dev_cli_s3_jsonl_auto(self):
  206. """magic_pdf_dev cli s3 auto."""
  207. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  208. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  209. logging.info(cmd)
  210. os.system(cmd)
  211. @pytest.mark.P1
  212. def test_pdf_dev_cli_pdf_json_auto(self):
  213. """magic_pdf_dev cli pdf+json auto."""
  214. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  215. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf')
  216. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  217. logging.info(cmd)
  218. os.system(cmd)
  219. @pytest.mark.P1
  220. def test_pdf_dev_cli_pdf_json_ocr(self):
  221. """magic_pdf_dev cli pdf+json ocr."""
  222. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  223. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf')
  224. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  225. logging.info(cmd)
  226. os.system(cmd)
  227. if __name__ == '__main__':
  228. pytest.main()