# test_cli_sdk.py
"""Tests for the magic-pdf command-line tools and the UNIPipe SDK."""
  2. import logging
  3. import os
  4. import pytest
  5. from conf import conf
  6. from lib import common
  7. import time
  8. import magic_pdf.model as model_config
  9. from magic_pdf.pipe.UNIPipe import UNIPipe
  10. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  11. from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  12. model_config.__use_inside_model__ = True
  13. pdf_res_path = conf.conf['pdf_res_path']
  14. code_path = conf.conf['code_path']
  15. pdf_dev_path = conf.conf['pdf_dev_path']
  16. class TestCli:
  17. """test cli."""
  18. @pytest.mark.P0
  19. def test_pdf_auto_sdk(self):
  20. """pdf sdk auto test."""
  21. demo_names = list()
  22. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  23. for pdf_file in os.listdir(pdf_path):
  24. if pdf_file.endswith('.pdf'):
  25. demo_names.append(pdf_file.split('.')[0])
  26. for demo_name in demo_names:
  27. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  28. print(pdf_path)
  29. pdf_bytes = open(pdf_path, 'rb').read()
  30. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  31. image_dir = str(os.path.basename(local_image_dir))
  32. image_writer = DiskReaderWriter(local_image_dir)
  33. model_json = list()
  34. jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
  35. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  36. pipe.pipe_classify()
  37. if len(model_json) == 0:
  38. if model_config.__use_inside_model__:
  39. pipe.pipe_analyze()
  40. else:
  41. exit(1)
  42. pipe.pipe_parse()
  43. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  44. dir_path = os.path.join(pdf_dev_path, 'mineru')
  45. if not os.path.exists(dir_path):
  46. os.makedirs(dir_path, exist_ok=True)
  47. res_path = os.path.join(dir_path, f'{demo_name}.md')
  48. common.delete_file(res_path)
  49. with open(res_path, 'w+', encoding='utf-8') as f:
  50. f.write(md_content)
  51. common.sdk_count_folders_and_check_contents(res_path)
  52. @pytest.mark.P0
  53. def test_pdf_ocr_sdk(self):
  54. """pdf sdk ocr test."""
  55. time.sleep(2)
  56. demo_names = list()
  57. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  58. for pdf_file in os.listdir(pdf_path):
  59. if pdf_file.endswith('.pdf'):
  60. demo_names.append(pdf_file.split('.')[0])
  61. for demo_name in demo_names:
  62. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  63. print(pdf_path)
  64. pdf_bytes = open(pdf_path, 'rb').read()
  65. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  66. image_dir = str(os.path.basename(local_image_dir))
  67. image_writer = DiskReaderWriter(local_image_dir)
  68. model_json = list()
  69. jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
  70. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  71. pipe.pipe_classify()
  72. if len(model_json) == 0:
  73. if model_config.__use_inside_model__:
  74. pipe.pipe_analyze()
  75. else:
  76. exit(1)
  77. pipe.pipe_parse()
  78. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  79. dir_path = os.path.join(pdf_dev_path, 'mineru')
  80. if not os.path.exists(dir_path):
  81. os.makedirs(dir_path, exist_ok=True)
  82. res_path = os.path.join(dir_path, f'{demo_name}.md')
  83. common.delete_file(res_path)
  84. with open(res_path, 'w+', encoding='utf-8') as f:
  85. f.write(md_content)
  86. common.sdk_count_folders_and_check_contents(res_path)
  87. @pytest.mark.P0
  88. def test_pdf_txt_sdk(self):
  89. """pdf sdk txt test."""
  90. time.sleep(2)
  91. demo_names = list()
  92. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  93. for pdf_file in os.listdir(pdf_path):
  94. if pdf_file.endswith('.pdf'):
  95. demo_names.append(pdf_file.split('.')[0])
  96. for demo_name in demo_names:
  97. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  98. pdf_bytes = open(pdf_path, 'rb').read()
  99. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  100. image_dir = str(os.path.basename(local_image_dir))
  101. image_writer = DiskReaderWriter(local_image_dir)
  102. model_json = list()
  103. jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
  104. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  105. pipe.pipe_classify()
  106. if len(model_json) == 0:
  107. if model_config.__use_inside_model__:
  108. pipe.pipe_analyze()
  109. else:
  110. exit(1)
  111. pipe.pipe_parse()
  112. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
  113. dir_path = os.path.join(pdf_dev_path, 'mineru')
  114. if not os.path.exists(dir_path):
  115. os.makedirs(dir_path, exist_ok=True)
  116. res_path = os.path.join(dir_path, f'{demo_name}.md')
  117. common.delete_file(res_path)
  118. with open(res_path, 'w+', encoding='utf-8') as f:
  119. f.write(md_content)
  120. common.sdk_count_folders_and_check_contents(res_path)
  121. @pytest.mark.P0
  122. def test_pdf_cli_auto(self):
  123. """magic_pdf cli test auto."""
  124. time.sleep(2)
  125. demo_names = []
  126. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  127. for pdf_file in os.listdir(pdf_path):
  128. if pdf_file.endswith('.pdf'):
  129. demo_names.append(pdf_file.split('.')[0])
  130. for demo_name in demo_names:
  131. res_path = os.path.join(pdf_dev_path, 'mineru')
  132. common.delete_file(res_path)
  133. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  134. pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
  135. logging.info(cmd)
  136. os.system(cmd)
  137. common.cli_count_folders_and_check_contents(
  138. os.path.join(res_path, demo_name, 'auto'))
  139. @pytest.mark.P0
  140. def test_pdf_cli_txt(self):
  141. """magic_pdf cli test txt."""
  142. time.sleep(2)
  143. demo_names = []
  144. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  145. for pdf_file in os.listdir(pdf_path):
  146. if pdf_file.endswith('.pdf'):
  147. demo_names.append(pdf_file.split('.')[0])
  148. for demo_name in demo_names:
  149. res_path = os.path.join(pdf_dev_path, 'mineru')
  150. common.delete_file(res_path)
  151. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  152. pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
  153. logging.info(cmd)
  154. os.system(cmd)
  155. common.cli_count_folders_and_check_contents(
  156. os.path.join(res_path, demo_name, 'txt'))
  157. @pytest.mark.P0
  158. def test_pdf_cli_ocr(self):
  159. """magic_pdf cli test ocr."""
  160. time.sleep(2)
  161. demo_names = []
  162. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  163. for pdf_file in os.listdir(pdf_path):
  164. if pdf_file.endswith('.pdf'):
  165. demo_names.append(pdf_file.split('.')[0])
  166. for demo_name in demo_names:
  167. res_path = os.path.join(pdf_dev_path, 'mineru')
  168. common.delete_file(res_path)
  169. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  170. pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
  171. logging.info(cmd)
  172. os.system(cmd)
  173. common.cli_count_folders_and_check_contents(
  174. os.path.join(res_path, demo_name, 'ocr'))
  175. @pytest.mark.skip(reason='out-of-date api')
  176. @pytest.mark.P1
  177. def test_pdf_dev_cli_local_jsonl_txt(self):
  178. """magic_pdf_dev cli local txt."""
  179. time.sleep(2)
  180. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  181. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  182. logging.info(cmd)
  183. os.system(cmd)
  184. @pytest.mark.skip(reason='out-of-date api')
  185. @pytest.mark.P1
  186. def test_pdf_dev_cli_local_jsonl_ocr(self):
  187. """magic_pdf_dev cli local ocr."""
  188. time.sleep(2)
  189. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  190. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  191. logging.info(cmd)
  192. os.system(cmd)
  193. @pytest.mark.skip(reason='out-of-date api')
  194. @pytest.mark.P1
  195. def test_pdf_dev_cli_local_jsonl_auto(self):
  196. """magic_pdf_dev cli local auto."""
  197. time.sleep(2)
  198. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  199. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  200. logging.info(cmd)
  201. os.system(cmd)
  202. @pytest.mark.skip(reason='out-of-date api')
  203. @pytest.mark.P1
  204. def test_pdf_dev_cli_s3_jsonl_txt(self):
  205. """magic_pdf_dev cli s3 txt."""
  206. time.sleep(2)
  207. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  208. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  209. logging.info(cmd)
  210. os.system(cmd)
  211. @pytest.mark.skip(reason='out-of-date api')
  212. @pytest.mark.P1
  213. def test_pdf_dev_cli_s3_jsonl_ocr(self):
  214. """magic_pdf_dev cli s3 ocr."""
  215. time.sleep(2)
  216. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  217. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  218. logging.info(cmd)
  219. os.system(cmd)
  220. @pytest.mark.skip(reason='out-of-date api')
  221. @pytest.mark.P1
  222. def test_pdf_dev_cli_s3_jsonl_auto(self):
  223. """magic_pdf_dev cli s3 auto."""
  224. time.sleep(2)
  225. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  226. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  227. logging.info(cmd)
  228. os.system(cmd)
  229. @pytest.mark.P1
  230. def test_pdf_dev_cli_pdf_json_auto(self):
  231. """magic_pdf_dev cli pdf+json auto."""
  232. time.sleep(2)
  233. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  234. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
  235. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  236. logging.info(cmd)
  237. os.system(cmd)
  238. @pytest.mark.skip(reason='out-of-date api')
  239. @pytest.mark.P1
  240. def test_pdf_dev_cli_pdf_json_ocr(self):
  241. """magic_pdf_dev cli pdf+json ocr."""
  242. time.sleep(2)
  243. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  244. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
  245. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  246. logging.info(cmd)
  247. os.system(cmd)
  248. @pytest.mark.P1
  249. def test_s3_sdk_suto(self):
  250. """
  251. test s3 sdk auto.
  252. """
  253. time.sleep(2)
  254. pdf_ak = os.getenv('pdf_ak')
  255. print (pdf_ak)
  256. pdf_sk = os.environ.get('pdf_sk', "")
  257. pdf_bucket = os.environ.get('bucket', "")
  258. pdf_endpoint = os.environ.get('pdf_endpoint', "")
  259. s3_pdf_path = conf.conf["s3_pdf_path"]
  260. image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
  261. print (image_dir)
  262. s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
  263. s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir)
  264. pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
  265. jso_useful_key = {"_pdf_type": "", "model_list": []}
  266. pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
  267. pipe.pipe_classify()
  268. pipe.pipe_analyze()
  269. pipe.pipe_parse()
  270. md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
  271. assert len(md_content) > 0
  272. @pytest.mark.P1
  273. def test_local_magic_pdf_open_st_table(self):
  274. """magic pdf cli open st table."""
  275. time.sleep(2)
  276. pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json"
  277. print (pre_cmd)
  278. os.system(pre_cmd)
  279. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  280. common.delete_file(pdf_res_path)
  281. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  282. os.system(cli_cmd)
  283. res = common.check_latex_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  284. assert res is True
  285. @pytest.mark.P1
  286. def test_local_magic_pdf_open_html_table(self):
  287. """magic pdf cli open html table."""
  288. time.sleep(2)
  289. pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
  290. os.system(pre_cmd)
  291. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  292. common.delete_file(pdf_res_path)
  293. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  294. os.system(cli_cmd)
  295. res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  296. assert res is True
  297. @pytest.mark.P1
  298. def test_magic_pdf_close_html_table_cpu(self):
  299. """magic pdf cli close html table cpu mode."""
  300. time.sleep(2)
  301. pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
  302. os.system(pre_cmd)
  303. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  304. common.delete_file(pdf_res_path)
  305. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  306. os.system(cli_cmd)
  307. res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  308. assert res is True
  309. @pytest.mark.P1
  310. def test_local_magic_pdf_close_html_table(self):
  311. """magic pdf cli close table."""
  312. time.sleep(2)
  313. pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
  314. os.system(pre_cmd)
  315. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  316. common.delete_file(pdf_res_path)
  317. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  318. os.system(cli_cmd)
  319. res = common.check_close_tables(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  320. assert res is True
  321. if __name__ == '__main__':
  322. pytest.main()