# test_cli_sdk.py (11 KB)
"""Tests for the magic-pdf CLI and SDK."""
import logging
import os
import subprocess

import pytest

from conf import conf
from lib import common
import magic_pdf.model as model_config
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  11. model_config.__use_inside_model__ = True
  12. pdf_res_path = conf.conf['pdf_res_path']
  13. code_path = conf.conf['code_path']
  14. pdf_dev_path = conf.conf['pdf_dev_path']
  15. class TestCli:
  16. """test cli."""
  17. @pytest.mark.P0
  18. def test_pdf_auto_sdk(self):
  19. """pdf sdk auto test."""
  20. demo_names = list()
  21. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  22. for pdf_file in os.listdir(pdf_path):
  23. if pdf_file.endswith('.pdf'):
  24. demo_names.append(pdf_file.split('.')[0])
  25. for demo_name in demo_names:
  26. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  27. print(pdf_path)
  28. pdf_bytes = open(pdf_path, 'rb').read()
  29. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  30. image_dir = str(os.path.basename(local_image_dir))
  31. image_writer = DiskReaderWriter(local_image_dir)
  32. model_json = list()
  33. jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
  34. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  35. pipe.pipe_classify()
  36. if len(model_json) == 0:
  37. if model_config.__use_inside_model__:
  38. pipe.pipe_analyze()
  39. else:
  40. exit(1)
  41. pipe.pipe_parse()
  42. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  43. dir_path = os.path.join(pdf_dev_path, 'mineru')
  44. if not os.path.exists(dir_path):
  45. os.makedirs(dir_path, exist_ok=True)
  46. res_path = os.path.join(dir_path, f'{demo_name}.md')
  47. common.delete_file(res_path)
  48. with open(res_path, 'w+', encoding='utf-8') as f:
  49. f.write(md_content)
  50. common.sdk_count_folders_and_check_contents(res_path)
  51. @pytest.mark.P0
  52. def test_pdf_ocr_sdk(self):
  53. """pdf sdk ocr test."""
  54. demo_names = list()
  55. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  56. for pdf_file in os.listdir(pdf_path):
  57. if pdf_file.endswith('.pdf'):
  58. demo_names.append(pdf_file.split('.')[0])
  59. for demo_name in demo_names:
  60. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  61. print(pdf_path)
  62. pdf_bytes = open(pdf_path, 'rb').read()
  63. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  64. image_dir = str(os.path.basename(local_image_dir))
  65. image_writer = DiskReaderWriter(local_image_dir)
  66. model_json = list()
  67. jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
  68. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  69. pipe.pipe_classify()
  70. if len(model_json) == 0:
  71. if model_config.__use_inside_model__:
  72. pipe.pipe_analyze()
  73. else:
  74. exit(1)
  75. pipe.pipe_parse()
  76. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  77. dir_path = os.path.join(pdf_dev_path, 'mineru')
  78. if not os.path.exists(dir_path):
  79. os.makedirs(dir_path, exist_ok=True)
  80. res_path = os.path.join(dir_path, f'{demo_name}.md')
  81. common.delete_file(res_path)
  82. with open(res_path, 'w+', encoding='utf-8') as f:
  83. f.write(md_content)
  84. common.sdk_count_folders_and_check_contents(res_path)
  85. @pytest.mark.P0
  86. def test_pdf_txt_sdk(self):
  87. """pdf sdk txt test."""
  88. demo_names = list()
  89. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  90. for pdf_file in os.listdir(pdf_path):
  91. if pdf_file.endswith('.pdf'):
  92. demo_names.append(pdf_file.split('.')[0])
  93. for demo_name in demo_names:
  94. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  95. print(pdf_path)
  96. pdf_bytes = open(pdf_path, 'rb').read()
  97. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  98. image_dir = str(os.path.basename(local_image_dir))
  99. image_writer = DiskReaderWriter(local_image_dir)
  100. model_json = list()
  101. jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
  102. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  103. pipe.pipe_classify()
  104. if len(model_json) == 0:
  105. if model_config.__use_inside_model__:
  106. pipe.pipe_analyze()
  107. else:
  108. exit(1)
  109. pipe.pipe_parse()
  110. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  111. dir_path = os.path.join(pdf_dev_path, 'mineru')
  112. if not os.path.exists(dir_path):
  113. os.makedirs(dir_path, exist_ok=True)
  114. res_path = os.path.join(dir_path, f'{demo_name}.md')
  115. common.delete_file(res_path)
  116. with open(res_path, 'w+', encoding='utf-8') as f:
  117. f.write(md_content)
  118. common.sdk_count_folders_and_check_contents(res_path)
  119. @pytest.mark.P0
  120. def test_pdf_cli_auto(self):
  121. """magic_pdf cli test auto."""
  122. demo_names = []
  123. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  124. for pdf_file in os.listdir(pdf_path):
  125. if pdf_file.endswith('.pdf'):
  126. demo_names.append(pdf_file.split('.')[0])
  127. for demo_name in demo_names:
  128. res_path = os.path.join(pdf_dev_path, 'mineru')
  129. common.delete_file(res_path)
  130. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  131. pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
  132. logging.info(cmd)
  133. os.system(cmd)
  134. common.cli_count_folders_and_check_contents(
  135. os.path.join(res_path, demo_name, 'auto'))
  136. @pytest.mark.P0
  137. def test_pdf_clit_txt(self):
  138. """magic_pdf cli test txt."""
  139. demo_names = []
  140. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  141. for pdf_file in os.listdir(pdf_path):
  142. if pdf_file.endswith('.pdf'):
  143. demo_names.append(pdf_file.split('.')[0])
  144. for demo_name in demo_names:
  145. res_path = os.path.join(pdf_dev_path, 'mineru')
  146. common.delete_file(res_path)
  147. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  148. pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
  149. logging.info(cmd)
  150. os.system(cmd)
  151. common.cli_count_folders_and_check_contents(
  152. os.path.join(res_path, demo_name, 'txt'))
  153. @pytest.mark.P0
  154. def test_pdf_clit_ocr(self):
  155. """magic_pdf cli test ocr."""
  156. demo_names = []
  157. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  158. for pdf_file in os.listdir(pdf_path):
  159. if pdf_file.endswith('.pdf'):
  160. demo_names.append(pdf_file.split('.')[0])
  161. for demo_name in demo_names:
  162. res_path = os.path.join(pdf_dev_path, 'mineru')
  163. common.delete_file(res_path)
  164. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  165. pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
  166. logging.info(cmd)
  167. os.system(cmd)
  168. common.cli_count_folders_and_check_contents(
  169. os.path.join(res_path, demo_name, 'ocr'))
  170. @pytest.mark.P1
  171. def test_pdf_dev_cli_local_jsonl_txt(self):
  172. """magic_pdf_dev cli local txt."""
  173. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  174. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  175. logging.info(cmd)
  176. os.system(cmd)
  177. @pytest.mark.P1
  178. def test_pdf_dev_cli_local_jsonl_ocr(self):
  179. """magic_pdf_dev cli local ocr."""
  180. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  181. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  182. logging.info(cmd)
  183. os.system(cmd)
  184. @pytest.mark.P1
  185. def test_pdf_dev_cli_local_jsonl_auto(self):
  186. """magic_pdf_dev cli local auto."""
  187. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  188. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  189. logging.info(cmd)
  190. os.system(cmd)
  191. @pytest.mark.P1
  192. def test_pdf_dev_cli_s3_jsonl_txt(self):
  193. """magic_pdf_dev cli s3 txt."""
  194. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  195. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  196. logging.info(cmd)
  197. os.system(cmd)
  198. @pytest.mark.P1
  199. def test_pdf_dev_cli_s3_jsonl_ocr(self):
  200. """magic_pdf_dev cli s3 ocr."""
  201. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  202. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  203. logging.info(cmd)
  204. os.system(cmd)
  205. @pytest.mark.P1
  206. def test_pdf_dev_cli_s3_jsonl_auto(self):
  207. """magic_pdf_dev cli s3 auto."""
  208. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  209. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  210. logging.info(cmd)
  211. os.system(cmd)
  212. @pytest.mark.P1
  213. def test_pdf_dev_cli_pdf_json_auto(self):
  214. """magic_pdf_dev cli pdf+json auto."""
  215. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  216. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf')
  217. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  218. logging.info(cmd)
  219. os.system(cmd)
  220. @pytest.mark.P1
  221. def test_pdf_dev_cli_pdf_json_ocr(self):
  222. """magic_pdf_dev cli pdf+json ocr."""
  223. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  224. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'research_report_1f978cd81fb7260c8f7644039ec2c054.pdf')
  225. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  226. logging.info(cmd)
  227. os.system(cmd)
  228. @pytest.mark.P1
  229. def test_s3_sdk_suto(self):
  230. pdf_ak = os.environ.get('pdf_ak', "")
  231. pdf_sk = os.environ.get('pdf_sk', "")
  232. pdf_bucket = os.environ.get('bucket', "")
  233. pdf_endpoint = os.environ.get('pdf_endpoint', "")
  234. s3_pdf_path = conf.conf["s3_pdf_path"]
  235. image_dir = "s3://" + pdf_bucket + "/mineru/test/test.md"
  236. s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
  237. s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir)
  238. pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
  239. jso_useful_key = {"_pdf_type": "", "model_list": []}
  240. pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
  241. pipe.pipe_classify()
  242. pipe.pipe_analyze()
  243. pipe.pipe_parse()
  244. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
  245. assert len(md_content) > 0
  246. if __name__ == '__main__':
  247. pytest.main()