# test_e2e.py
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from pathlib import Path

# NOTE(review): `backend` is imported but never referenced in this file (the
# vlm test binds a local string also named `backend`) — looks like a stray
# import; confirm before removing.
from cryptography.hazmat.backends.openssl import backend
from loguru import logger

from mineru.cli.common import (
    convert_pdf_bytes_to_bytes_by_pypdfium2,
    prepare_env,
    read_fn,
)
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
    union_make as pipeline_union_make,
)
from mineru.backend.pipeline.model_json_to_middle_json import (
    result_to_middle_json as pipeline_result_to_middle_json,
)
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

# NOTE(review): imported but not referenced anywhere in this file — presumably
# kept so model download happens at import time; verify before removing.
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
  26. class TestE2E:
  27. def test_pipeline_with_two_config(self):
  28. def do_parse(
  29. output_dir, # Output directory for storing parsing results
  30. pdf_file_names: list[str], # List of PDF file names to be parsed
  31. pdf_bytes_list: list[bytes], # List of PDF bytes to be parsed
  32. p_lang_list: list[
  33. str
  34. ], # List of languages for each PDF, default is 'ch' (Chinese)
  35. parse_method="auto", # The method for parsing PDF, default is 'auto'
  36. formula_enable=True, # Enable formula parsing
  37. table_enable=True, # Enable table parsing
  38. f_draw_layout_bbox=True, # Whether to draw layout bounding boxes
  39. f_draw_span_bbox=True, # Whether to draw span bounding boxes
  40. f_dump_md=True, # Whether to dump markdown files
  41. f_dump_middle_json=True, # Whether to dump middle JSON files
  42. f_dump_model_output=True, # Whether to dump model output files
  43. f_dump_orig_pdf=True, # Whether to dump original PDF files
  44. f_dump_content_list=True, # Whether to dump content list files
  45. f_make_md_mode=MakeMode.MM_MD, # The mode for making markdown content, default is MM_MD
  46. start_page_id=0, # Start page ID for parsing, default is 0
  47. end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
  48. ):
  49. for idx, pdf_bytes in enumerate(pdf_bytes_list):
  50. new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
  51. pdf_bytes, start_page_id, end_page_id
  52. )
  53. pdf_bytes_list[idx] = new_pdf_bytes
  54. (
  55. infer_results,
  56. all_image_lists,
  57. all_pdf_docs,
  58. lang_list,
  59. ocr_enabled_list,
  60. ) = pipeline_doc_analyze(
  61. pdf_bytes_list,
  62. p_lang_list,
  63. parse_method=parse_method,
  64. formula_enable=formula_enable,
  65. table_enable=table_enable,
  66. )
  67. for idx, model_list in enumerate(infer_results):
  68. model_json = copy.deepcopy(model_list)
  69. pdf_file_name = pdf_file_names[idx]
  70. local_image_dir, local_md_dir = prepare_env(
  71. output_dir, pdf_file_name, parse_method
  72. )
  73. image_writer, md_writer = FileBasedDataWriter(
  74. local_image_dir
  75. ), FileBasedDataWriter(local_md_dir)
  76. images_list = all_image_lists[idx]
  77. pdf_doc = all_pdf_docs[idx]
  78. _lang = lang_list[idx]
  79. _ocr_enable = ocr_enabled_list[idx]
  80. middle_json = pipeline_result_to_middle_json(
  81. model_list,
  82. images_list,
  83. pdf_doc,
  84. image_writer,
  85. _lang,
  86. _ocr_enable,
  87. formula_enable,
  88. )
  89. pdf_info = middle_json["pdf_info"]
  90. pdf_bytes = pdf_bytes_list[idx]
  91. if f_draw_layout_bbox:
  92. draw_layout_bbox(
  93. pdf_info,
  94. pdf_bytes,
  95. local_md_dir,
  96. f"{pdf_file_name}_layout.pdf",
  97. )
  98. if f_draw_span_bbox:
  99. draw_span_bbox(
  100. pdf_info,
  101. pdf_bytes,
  102. local_md_dir,
  103. f"{pdf_file_name}_span.pdf",
  104. )
  105. if f_dump_orig_pdf:
  106. md_writer.write(
  107. f"{pdf_file_name}_origin.pdf",
  108. pdf_bytes,
  109. )
  110. if f_dump_md:
  111. image_dir = str(os.path.basename(local_image_dir))
  112. md_content_str = pipeline_union_make(
  113. pdf_info, f_make_md_mode, image_dir
  114. )
  115. md_writer.write_string(
  116. f"{pdf_file_name}.md",
  117. md_content_str,
  118. )
  119. if f_dump_content_list:
  120. image_dir = str(os.path.basename(local_image_dir))
  121. content_list = pipeline_union_make(
  122. pdf_info, MakeMode.CONTENT_LIST, image_dir
  123. )
  124. md_writer.write_string(
  125. f"{pdf_file_name}_content_list.json",
  126. json.dumps(content_list, ensure_ascii=False, indent=4),
  127. )
  128. if f_dump_middle_json:
  129. md_writer.write_string(
  130. f"{pdf_file_name}_middle.json",
  131. json.dumps(middle_json, ensure_ascii=False, indent=4),
  132. )
  133. if f_dump_model_output:
  134. md_writer.write_string(
  135. f"{pdf_file_name}_model.json",
  136. json.dumps(model_json, ensure_ascii=False, indent=4),
  137. )
  138. logger.info(f"local output dir is {local_md_dir}")
  139. def parse_doc(
  140. path_list: list[Path],
  141. output_dir,
  142. lang="ch",
  143. method="auto",
  144. start_page_id=0,
  145. end_page_id=None,
  146. ):
  147. file_name_list = []
  148. pdf_bytes_list = []
  149. lang_list = []
  150. for path in path_list:
  151. file_name = str(Path(path).stem)
  152. pdf_bytes = read_fn(path)
  153. file_name_list.append(file_name)
  154. pdf_bytes_list.append(pdf_bytes)
  155. lang_list.append(lang)
  156. # 运行两次 do_parse,分别是开启公式和表格解析和不开启
  157. do_parse(
  158. output_dir=output_dir,
  159. pdf_file_names=file_name_list,
  160. pdf_bytes_list=pdf_bytes_list,
  161. p_lang_list=lang_list,
  162. parse_method=method,
  163. start_page_id=start_page_id,
  164. end_page_id=end_page_id,
  165. )
  166. do_parse(
  167. output_dir=output_dir,
  168. pdf_file_names=file_name_list,
  169. pdf_bytes_list=pdf_bytes_list,
  170. p_lang_list=lang_list,
  171. parse_method=method,
  172. table_enable=False,
  173. formula_enable=False,
  174. start_page_id=start_page_id,
  175. end_page_id=end_page_id,
  176. )
  177. __dir__ = os.path.dirname(os.path.abspath(__file__))
  178. pdf_files_dir = os.path.join(__dir__, "pdfs")
  179. output_dir = os.path.join(__dir__, "output")
  180. pdf_suffixes = [".pdf"]
  181. image_suffixes = [".png", ".jpeg", ".jpg"]
  182. doc_path_list = []
  183. for doc_path in Path(pdf_files_dir).glob("*"):
  184. if doc_path.suffix in pdf_suffixes + image_suffixes:
  185. doc_path_list.append(doc_path)
  186. # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  187. parse_doc(doc_path_list, output_dir)
  188. def test_vlm_transformers_with_default_config(self):
  189. def do_parse(
  190. output_dir, # Output directory for storing parsing results
  191. pdf_file_names: list[str], # List of PDF file names to be parsed
  192. pdf_bytes_list: list[bytes], # List of PDF bytes to be parsed
  193. server_url=None, # Server URL for vlm-sglang-client backend
  194. f_draw_layout_bbox=True, # Whether to draw layout bounding boxes
  195. f_dump_md=True, # Whether to dump markdown files
  196. f_dump_middle_json=True, # Whether to dump middle JSON files
  197. f_dump_model_output=True, # Whether to dump model output files
  198. f_dump_orig_pdf=True, # Whether to dump original PDF files
  199. f_dump_content_list=True, # Whether to dump content list files
  200. f_make_md_mode=MakeMode.MM_MD, # The mode for making markdown content, default is MM_MD
  201. start_page_id=0, # Start page ID for parsing, default is 0
  202. end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
  203. ):
  204. backend = "transformers"
  205. f_draw_span_bbox = False
  206. parse_method = "vlm"
  207. for idx, pdf_bytes in enumerate(pdf_bytes_list):
  208. pdf_file_name = pdf_file_names[idx]
  209. pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
  210. pdf_bytes, start_page_id, end_page_id
  211. )
  212. local_image_dir, local_md_dir = prepare_env(
  213. output_dir, pdf_file_name, parse_method
  214. )
  215. image_writer, md_writer = FileBasedDataWriter(
  216. local_image_dir
  217. ), FileBasedDataWriter(local_md_dir)
  218. middle_json, infer_result = vlm_doc_analyze(
  219. pdf_bytes,
  220. image_writer=image_writer,
  221. backend=backend,
  222. server_url=server_url,
  223. )
  224. pdf_info = middle_json["pdf_info"]
  225. if f_draw_layout_bbox:
  226. draw_layout_bbox(
  227. pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf"
  228. )
  229. if f_draw_span_bbox:
  230. draw_span_bbox(
  231. pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf"
  232. )
  233. if f_dump_orig_pdf:
  234. md_writer.write(
  235. f"{pdf_file_name}_origin.pdf",
  236. pdf_bytes,
  237. )
  238. if f_dump_md:
  239. image_dir = str(os.path.basename(local_image_dir))
  240. md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
  241. md_writer.write_string(
  242. f"{pdf_file_name}.md",
  243. md_content_str,
  244. )
  245. if f_dump_content_list:
  246. image_dir = str(os.path.basename(local_image_dir))
  247. content_list = vlm_union_make(
  248. pdf_info, MakeMode.CONTENT_LIST, image_dir
  249. )
  250. md_writer.write_string(
  251. f"{pdf_file_name}_content_list.json",
  252. json.dumps(content_list, ensure_ascii=False, indent=4),
  253. )
  254. if f_dump_middle_json:
  255. md_writer.write_string(
  256. f"{pdf_file_name}_middle.json",
  257. json.dumps(middle_json, ensure_ascii=False, indent=4),
  258. )
  259. if f_dump_model_output:
  260. model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
  261. md_writer.write_string(
  262. f"{pdf_file_name}_model_output.txt",
  263. model_output,
  264. )
  265. logger.info(f"local output dir is {local_md_dir}")
  266. def parse_doc(
  267. path_list: list[Path],
  268. output_dir,
  269. lang="ch",
  270. server_url=None,
  271. start_page_id=0,
  272. end_page_id=None,
  273. ):
  274. file_name_list = []
  275. pdf_bytes_list = []
  276. lang_list = []
  277. for path in path_list:
  278. file_name = str(Path(path).stem)
  279. pdf_bytes = read_fn(path)
  280. file_name_list.append(file_name)
  281. pdf_bytes_list.append(pdf_bytes)
  282. lang_list.append(lang)
  283. do_parse(
  284. output_dir=output_dir,
  285. pdf_file_names=file_name_list,
  286. pdf_bytes_list=pdf_bytes_list,
  287. server_url=server_url,
  288. start_page_id=start_page_id,
  289. end_page_id=end_page_id,
  290. )
  291. __dir__ = os.path.dirname(os.path.abspath(__file__))
  292. pdf_files_dir = os.path.join(__dir__, "pdfs")
  293. output_dir = os.path.join(__dir__, "output")
  294. pdf_suffixes = [".pdf"]
  295. image_suffixes = [".png", ".jpeg", ".jpg"]
  296. doc_path_list = []
  297. for doc_path in Path(pdf_files_dir).glob("*"):
  298. if doc_path.suffix in pdf_suffixes + image_suffixes:
  299. doc_path_list.append(doc_path)
  300. # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  301. parse_doc(doc_path_list, output_dir)