test_cli_sdk.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. """test cli and sdk."""
  2. import logging
  3. import os
  4. import pytest
  5. from conf import conf
  6. from lib import common
  7. import time
  8. import magic_pdf.model as model_config
  9. from magic_pdf.data.read_api import read_local_images
  10. from magic_pdf.pipe.UNIPipe import UNIPipe
  11. from magic_pdf.data.read_api import read_local_office
  12. from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
  13. from magic_pdf.config.make_content_config import DropMode, MakeMode
  14. from magic_pdf.pipe.OCRPipe import OCRPipe
  15. from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  16. from magic_pdf.data.dataset import PymuDocDataset
  17. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  18. from magic_pdf.config.enums import SupportedPdfParseMethod
  19. model_config.__use_inside_model__ = True
  20. pdf_res_path = conf.conf['pdf_res_path']
  21. code_path = conf.conf['code_path']
  22. pdf_dev_path = conf.conf['pdf_dev_path']
  23. magic_pdf_config = "/home/quyuan/magic-pdf.json"
  24. class TestCli:
  25. """test cli."""
  26. @pytest.fixture(autouse=True)
  27. def setup(self):
  28. """
  29. init
  30. """
  31. common.clear_gpu_memory()
  32. common.update_config_file(magic_pdf_config, "device-mode", "cuda")
  33. # 这里可以添加任何前置操作
  34. yield
  35. @pytest.mark.P0
  36. def test_pdf_local_sdk(self):
  37. """pdf sdk auto test."""
  38. demo_names = list()
  39. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  40. for pdf_file in os.listdir(pdf_path):
  41. if pdf_file.endswith('.pdf'):
  42. demo_names.append(pdf_file.split('.')[0])
  43. for demo_name in demo_names:
  44. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  45. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  46. image_dir = str(os.path.basename(local_image_dir))
  47. name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
  48. dir_path = os.path.join(pdf_dev_path, 'mineru')
  49. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  50. reader1 = FileBasedDataReader("")
  51. pdf_bytes = reader1.read(pdf_path)
  52. ds = PymuDocDataset(pdf_bytes)
  53. ## inference
  54. if ds.classify() == SupportedPdfParseMethod.OCR:
  55. infer_result = ds.apply(doc_analyze, ocr=True)
  56. ## pipeline
  57. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  58. else:
  59. infer_result = ds.apply(doc_analyze, ocr=False)
  60. ## pipeline
  61. pipe_result = infer_result.pipe_txt_mode(image_writer)
  62. common.delete_file(dir_path)
  63. ### draw model result on each page
  64. infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
  65. ### get model inference result
  66. model_inference_result = infer_result.get_infer_res()
  67. ### draw layout result on each page
  68. pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
  69. ### draw spans result on each page
  70. pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
  71. ### dump markdown
  72. pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
  73. ### dump content list
  74. pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
  75. ### get markdown content
  76. md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
  77. ### get content list content
  78. content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
  79. ### get middle json
  80. middle_json_content = pipe_result.get_middle_json()
  81. common.sdk_count_folders_and_check_contents(dir_path)
  82. @pytest.mark.P0
  83. def test_pdf_s3_sdk(self):
  84. """pdf s3 sdk test."""
  85. demo_names = list()
  86. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  87. for pdf_file in os.listdir(pdf_path):
  88. if pdf_file.endswith('.pdf'):
  89. demo_names.append(pdf_file.split('.')[0])
  90. for demo_name in demo_names:
  91. pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
  92. local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
  93. image_dir = str(os.path.basename(local_image_dir))
  94. name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
  95. dir_path = os.path.join(pdf_dev_path, 'mineru')
  96. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  97. reader1 = FileBasedDataReader("")
  98. pdf_bytes = reader1.read(pdf_path)
  99. ds = PymuDocDataset(pdf_bytes)
  100. ## inference
  101. if ds.classify() == SupportedPdfParseMethod.OCR:
  102. infer_result = ds.apply(doc_analyze, ocr=True)
  103. ## pipeline
  104. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  105. else:
  106. infer_result = ds.apply(doc_analyze, ocr=False)
  107. ## pipeline
  108. pipe_result = infer_result.pipe_txt_mode(image_writer)
  109. common.delete_file(dir_path)
  110. ### draw model result on each page
  111. infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
  112. ### get model inference result
  113. model_inference_result = infer_result.get_infer_res()
  114. ### draw layout result on each page
  115. pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
  116. ### draw spans result on each page
  117. pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
  118. ### dump markdown
  119. pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
  120. ### dump content list
  121. pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
  122. ### get markdown content
  123. md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
  124. ### get content list content
  125. content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
  126. ### get middle json
  127. middle_json_content = pipe_result.get_middle_json()
  128. common.sdk_count_folders_and_check_contents(dir_path)
  129. @pytest.mark.P0
  130. def test_pdf_local_ppt(self):
  131. """pdf sdk auto test."""
  132. demo_names = list()
  133. pdf_path = os.path.join(pdf_dev_path, 'ppt')
  134. for pdf_file in os.listdir(pdf_path):
  135. if pdf_file.endswith('.pptx'):
  136. demo_names.append(pdf_file.split('.')[0])
  137. for demo_name in demo_names:
  138. pdf_path = os.path.join(pdf_dev_path, 'ppt', f'{demo_name}.pptx')
  139. local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
  140. image_dir = str(os.path.basename(local_image_dir))
  141. name_without_suff = os.path.basename(pdf_path).split(".pptx")[0]
  142. dir_path = os.path.join(pdf_dev_path, 'mineru')
  143. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  144. ds = read_local_office(pdf_path)[0]
  145. common.delete_file(dir_path)
  146. ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
  147. common.sdk_count_folders_and_check_contents(dir_path)
  148. @pytest.mark.P0
  149. def test_pdf_local_image(self):
  150. """pdf sdk auto test."""
  151. demo_names = list()
  152. pdf_path = os.path.join(pdf_dev_path, 'images')
  153. for pdf_file in os.listdir(pdf_path):
  154. if pdf_file.endswith('.jpg'):
  155. demo_names.append(pdf_file.split('.')[0])
  156. for demo_name in demo_names:
  157. pdf_path = os.path.join(pdf_dev_path, 'images', f'{demo_name}.jpg')
  158. local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
  159. image_dir = str(os.path.basename(local_image_dir))
  160. name_without_suff = os.path.basename(pdf_path).split(".jpg")[0]
  161. dir_path = os.path.join(pdf_dev_path, 'mineru')
  162. common.delete_file(dir_path)
  163. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  164. ds = read_local_images(pdf_path)[0]
  165. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
  166. md_writer, f"{name_without_suff}.md", image_dir)
  167. common.sdk_count_folders_and_check_contents(dir_path)
  168. @pytest.mark.P0
  169. def test_local_image_dir(self):
  170. """local image dir."""
  171. demo_names = list()
  172. pdf_path = os.path.join(pdf_dev_path, 'images')
  173. dir_path = os.path.join(pdf_dev_path, 'mineru')
  174. local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
  175. image_dir = str(os.path.basename(local_image_dir))
  176. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  177. common.delete_file(dir_path)
  178. dss = read_local_images(pdf_path, suffixes=['.png', '.jpg'])
  179. count = 0
  180. for ds in dss:
  181. ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
  182. count += 1
  183. common.sdk_count_folders_and_check_contents(dir_path)
  184. def test_local_doc_parse(self):
  185. """
  186. doc 解析
  187. """
  188. demo_names = list()
  189. pdf_path = os.path.join(pdf_dev_path, 'doc')
  190. for pdf_file in os.listdir(pdf_path):
  191. if pdf_file.endswith('.docx'):
  192. demo_names.append(pdf_file.split('.')[0])
  193. for demo_name in demo_names:
  194. pdf_path = os.path.join(pdf_dev_path, 'doc', f'{demo_name}.docx')
  195. local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
  196. image_dir = str(os.path.basename(local_image_dir))
  197. name_without_suff = os.path.basename(pdf_path).split(".docx")[0]
  198. dir_path = os.path.join(pdf_dev_path, 'mineru')
  199. image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
  200. ds = read_local_office(pdf_path)[0]
  201. common.delete_file(dir_path)
  202. ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
  203. common.sdk_count_folders_and_check_contents(dir_path)
  204. @pytest.mark.P0
  205. def test_pdf_cli_auto(self):
  206. """magic_pdf cli test auto."""
  207. time.sleep(2)
  208. demo_names = []
  209. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  210. for pdf_file in os.listdir(pdf_path):
  211. if pdf_file.endswith('.pdf'):
  212. demo_names.append(pdf_file.split('.')[0])
  213. for demo_name in demo_names:
  214. res_path = os.path.join(pdf_dev_path, 'mineru')
  215. common.delete_file(res_path)
  216. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  217. pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
  218. logging.info(cmd)
  219. os.system(cmd)
  220. common.cli_count_folders_and_check_contents(
  221. os.path.join(res_path, demo_name, 'auto'))
  222. @pytest.mark.P0
  223. def test_pdf_cli_txt(self):
  224. """magic_pdf cli test txt."""
  225. time.sleep(2)
  226. demo_names = []
  227. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  228. for pdf_file in os.listdir(pdf_path):
  229. if pdf_file.endswith('.pdf'):
  230. demo_names.append(pdf_file.split('.')[0])
  231. for demo_name in demo_names:
  232. res_path = os.path.join(pdf_dev_path, 'mineru')
  233. common.delete_file(res_path)
  234. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  235. pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
  236. logging.info(cmd)
  237. os.system(cmd)
  238. common.cli_count_folders_and_check_contents(
  239. os.path.join(res_path, demo_name, 'txt'))
  240. @pytest.mark.P0
  241. def test_pdf_cli_ocr(self):
  242. """magic_pdf cli test ocr."""
  243. time.sleep(2)
  244. demo_names = []
  245. pdf_path = os.path.join(pdf_dev_path, 'pdf')
  246. for pdf_file in os.listdir(pdf_path):
  247. if pdf_file.endswith('.pdf'):
  248. demo_names.append(pdf_file.split('.')[0])
  249. for demo_name in demo_names:
  250. res_path = os.path.join(pdf_dev_path, 'mineru')
  251. common.delete_file(res_path)
  252. cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
  253. pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
  254. logging.info(cmd)
  255. os.system(cmd)
  256. common.cli_count_folders_and_check_contents(
  257. os.path.join(res_path, demo_name, 'ocr'))
  258. @pytest.mark.skip(reason='out-of-date api')
  259. @pytest.mark.P1
  260. def test_pdf_dev_cli_local_jsonl_txt(self):
  261. """magic_pdf_dev cli local txt."""
  262. time.sleep(2)
  263. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  264. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  265. logging.info(cmd)
  266. os.system(cmd)
  267. @pytest.mark.skip(reason='out-of-date api')
  268. @pytest.mark.P1
  269. def test_pdf_dev_cli_local_jsonl_ocr(self):
  270. """magic_pdf_dev cli local ocr."""
  271. time.sleep(2)
  272. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  273. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  274. logging.info(cmd)
  275. os.system(cmd)
  276. @pytest.mark.skip(reason='out-of-date api')
  277. @pytest.mark.P1
  278. def test_pdf_dev_cli_local_jsonl_auto(self):
  279. """magic_pdf_dev cli local auto."""
  280. time.sleep(2)
  281. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  282. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  283. logging.info(cmd)
  284. os.system(cmd)
  285. @pytest.mark.skip(reason='out-of-date api')
  286. @pytest.mark.P1
  287. def test_pdf_dev_cli_s3_jsonl_txt(self):
  288. """magic_pdf_dev cli s3 txt."""
  289. time.sleep(2)
  290. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  291. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
  292. logging.info(cmd)
  293. os.system(cmd)
  294. @pytest.mark.skip(reason='out-of-date api')
  295. @pytest.mark.P1
  296. def test_pdf_dev_cli_s3_jsonl_ocr(self):
  297. """magic_pdf_dev cli s3 ocr."""
  298. time.sleep(2)
  299. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  300. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
  301. logging.info(cmd)
  302. os.system(cmd)
  303. @pytest.mark.skip(reason='out-of-date api')
  304. @pytest.mark.P1
  305. def test_pdf_dev_cli_s3_jsonl_auto(self):
  306. """magic_pdf_dev cli s3 auto."""
  307. time.sleep(2)
  308. jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
  309. cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
  310. logging.info(cmd)
  311. os.system(cmd)
  312. @pytest.mark.P1
  313. def test_pdf_dev_cli_pdf_json_auto(self):
  314. """magic_pdf_dev cli pdf+json auto."""
  315. time.sleep(2)
  316. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  317. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
  318. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  319. logging.info(cmd)
  320. os.system(cmd)
  321. @pytest.mark.skip(reason='out-of-date api')
  322. @pytest.mark.P1
  323. def test_pdf_dev_cli_pdf_json_ocr(self):
  324. """magic_pdf_dev cli pdf+json ocr."""
  325. time.sleep(2)
  326. json_path = os.path.join(pdf_dev_path, 'test_model.json')
  327. pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
  328. cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
  329. logging.info(cmd)
  330. os.system(cmd)
  331. @pytest.mark.P1
  332. def test_s3_sdk_auto(self):
  333. """
  334. test s3 sdk auto.
  335. """
  336. time.sleep(2)
  337. pdf_ak = os.getenv('pdf_ak')
  338. print (pdf_ak)
  339. pdf_sk = os.environ.get('pdf_sk', "")
  340. pdf_bucket = os.environ.get('bucket', "")
  341. pdf_endpoint = os.environ.get('pdf_endpoint', "")
  342. s3_pdf_path = conf.conf["s3_pdf_path"]
  343. image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
  344. prefix = "mineru/test/output"
  345. reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
  346. writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
  347. # = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
  348. image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
  349. local_dir = "output"
  350. name_without_suff = os.path.basename(s3_pdf_path).split(".")[0]
  351. # read bytes
  352. pdf_bytes = reader.read(s3_pdf_path) # read the pdf content
  353. # proc
  354. ## Create Dataset Instance
  355. ds = PymuDocDataset(pdf_bytes)
  356. ## inference
  357. if ds.classify() == SupportedPdfParseMethod.OCR:
  358. infer_result = ds.apply(doc_analyze, ocr=True)
  359. ## pipeline
  360. pipe_result = infer_result.pipe_ocr_mode(image_writer)
  361. else:
  362. infer_result = ds.apply(doc_analyze, ocr=False)
  363. ## pipeline
  364. pipe_result = infer_result.pipe_txt_mode(image_writer)
  365. ### draw model result on each page
  366. infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
  367. ### draw layout result on each page
  368. pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
  369. ### draw spans result on each page
  370. pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
  371. ### dump markdown
  372. pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
  373. ### dump content list
  374. pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", image_dir)
  375. @pytest.mark.P1
  376. def test_local_magic_pdf_open_st_table(self):
  377. """magic pdf cli open st table."""
  378. time.sleep(2)
  379. #pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json"
  380. value = {
  381. "model": "struct_eqtable",
  382. "enable": True,
  383. "max_time": 400
  384. }
  385. common.update_config_file(magic_pdf_config, "table-config", value)
  386. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  387. common.delete_file(pdf_res_path)
  388. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  389. os.system(cli_cmd)
  390. res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  391. assert res is True
  392. @pytest.mark.P1
  393. def test_local_magic_pdf_open_tablemaster_cuda(self):
  394. """magic pdf cli open table master html table cuda mode."""
  395. time.sleep(2)
  396. #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
  397. #os.system(pre_cmd)
  398. value = {
  399. "model": "tablemaster",
  400. "enable": True,
  401. "max_time": 400
  402. }
  403. common.update_config_file(magic_pdf_config, "table-config", value)
  404. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  405. common.delete_file(pdf_res_path)
  406. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  407. os.system(cli_cmd)
  408. res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  409. assert res is True
  410. @pytest.mark.P1
  411. def test_local_magic_pdf_open_rapidai_table(self):
  412. """magic pdf cli open rapid ai table."""
  413. time.sleep(2)
  414. #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
  415. #os.system(pre_cmd)
  416. value = {
  417. "model": "rapid_table",
  418. "enable": True,
  419. "max_time": 400
  420. }
  421. common.update_config_file(magic_pdf_config, "table-config", value)
  422. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  423. common.delete_file(pdf_res_path)
  424. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  425. os.system(cli_cmd)
  426. res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  427. assert res is True
  428. @pytest.mark.P1
  429. def test_local_magic_pdf_doclayout_yolo(self):
  430. """magic pdf cli open doclyaout yolo."""
  431. time.sleep(2)
  432. #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
  433. #os.system(pre_cmd)
  434. value = {
  435. "model": "doclayout_yolo"
  436. }
  437. common.update_config_file(magic_pdf_config, "layout-config", value)
  438. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  439. common.delete_file(pdf_res_path)
  440. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  441. os.system(cli_cmd)
  442. common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
  443. @pytest.mark.P1
  444. def test_local_magic_pdf_layoutlmv3_yolo(self):
  445. """magic pdf cli open layoutlmv3."""
  446. time.sleep(2)
  447. value = {
  448. "model": "layoutlmv3"
  449. }
  450. common.update_config_file(magic_pdf_config, "layout-config", value)
  451. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  452. common.delete_file(pdf_res_path)
  453. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  454. os.system(cli_cmd)
  455. common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
  456. #res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  457. @pytest.mark.P1
  458. def test_magic_pdf_cpu(self):
  459. """magic pdf cli cpu mode."""
  460. time.sleep(2)
  461. #pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
  462. #os.system(pre_cmd)
  463. value = {
  464. "model": "tablemaster",
  465. "enable": False,
  466. "max_time": 400
  467. }
  468. common.update_config_file(magic_pdf_config, "table-config", value)
  469. common.update_config_file(magic_pdf_config, "device-mode", "cpu")
  470. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  471. common.delete_file(pdf_res_path)
  472. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  473. os.system(cli_cmd)
  474. common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
  475. @pytest.mark.P1
  476. def test_local_magic_pdf_close_html_table(self):
  477. """magic pdf cli close table."""
  478. time.sleep(2)
  479. #pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
  480. #os.system(pre_cmd)
  481. value = {
  482. "model": "tablemaster",
  483. "enable": False,
  484. "max_time": 400
  485. }
  486. common.update_config_file(magic_pdf_config, "table-config", value)
  487. pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
  488. common.delete_file(pdf_res_path)
  489. cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
  490. os.system(cli_cmd)
  491. res = common.check_close_tables(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
  492. assert res is True
  493. if __name__ == '__main__':
  494. pytest.main()