# coding=utf8
import sys
import time
from urllib.parse import quote

from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format, \
    ocr_mk_mm_markdown_with_para
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown, mk_universal_format
from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from loguru import logger
from app.common.s3 import get_s3_config, get_s3_client
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
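
# Every pipeline stage below shares one contract: it takes the per-pdf job
# dict (`jso`), mutates it in place (adding results, timings, or the
# need_drop/drop_reason flags), and returns it, so stages can be chained.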


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso['need_drop'] = True
    jso['drop_reason'] = DropReason.Exception
    jso['exception'] = f"ERROR: {e}"
    return jso
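

# The getters below paper over two generations of field names in the job dict:
# newer jobs carry data_type/bookid/data_source, older ones
# file_type/original_file_id/file_source.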
def get_data_type(jso: dict):
    data_type = jso.get('data_type')
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get('bookid')
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id


def get_data_source(jso: dict):
    data_source = jso.get('data_source')
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source
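

# First stage: download the pdf from s3 and run pdf_meta_scan over it, storing
# the resulting metadata (page sizes, image/text statistics, language, ...)
# under jso['pdf_meta'] for the classification stage that follows.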
def meta_scan(jso: dict, doc_layout_check=True) -> dict:
    s3_pdf_path = jso.get('file_location')
    s3_config = get_s3_config(s3_pdf_path)
    if doc_layout_check:
        if 'doc_layout_result' not in jso:  # check whether the model data exists in the json; if not, skip this pdf
            jso['need_drop'] = True
            jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
            return jso
    try:
        data_source = get_data_source(jso)
        file_id = jso.get('file_id')
        book_name = f"{data_source}/{file_id}"
        # the first page has an excessive-drawing problem
        # special_pdf_list = ['zlib/zlib_21822650']
        # if book_name in special_pdf_list:
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.SPECIAL_PDF
        #     return jso
        start_time = time.time()  # record the start time
        logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
        file_content = read_file(s3_pdf_path, s3_config)
        read_file_time = int(time.time() - start_time)  # compute the elapsed time
        start_time = time.time()  # record the start time
        res = pdf_meta_scan(s3_pdf_path, file_content)
        if res.get('need_drop', False):  # if the returned dict carries need_drop, extract drop_reason and skip this parse
            jso['need_drop'] = True
            jso['drop_reason'] = res["drop_reason"]
        else:  # normal return
            jso['pdf_meta'] = res
            jso['content'] = ""
            jso['remark'] = ""
            jso['data_url'] = ""
        end_time = time.time()  # record the end time
        meta_scan_time = int(end_time - start_time)  # compute the elapsed time
        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}", file=sys.stderr)
        jso['read_file_time'] = read_file_time
        jso['meta_scan_time'] = meta_scan_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
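

# Second stage: use the metadata gathered by meta_scan to decide whether the
# pdf is text-based (directly parseable) or image-based (needs OCR); encrypted
# pdfs, non-text pdfs, and languages other than zh/en are flagged for dropping.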
def classify_by_type(jso: dict, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            return jso
    # main logic starts here
    try:
        pdf_meta = jso.get('pdf_meta')
        data_source = get_data_source(jso)
        file_id = jso.get('file_id')
        book_name = f"{data_source}/{file_id}"
        total_page = pdf_meta["total_page"]
        page_width = pdf_meta["page_width_pts"]
        page_height = pdf_meta["page_height_pts"]
        img_sz_list = pdf_meta["image_info_per_page"]
        img_num_list = pdf_meta['imgs_per_page']
        text_len_list = pdf_meta['text_len_per_page']
        text_layout_list = pdf_meta['text_layout_per_page']
        text_language = pdf_meta['text_language']
        # allow_language = ['zh', 'en']  # allowed languages; currently only Simplified Chinese and English
        # if text_language not in allow_language:  # drop the pdf if its language is not in the allowed set
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
        #     return jso
        pdf_path = pdf_meta['pdf_path']
        is_encrypted = pdf_meta['is_encrypted']
        is_needs_password = pdf_meta['is_needs_password']
        if is_encrypted or is_needs_password:  # encrypted, password-protected, and pageless pdfs are all skipped
            jso['need_drop'] = True
            jso['drop_reason'] = DropReason.ENCRYPTED
        else:
            start_time = time.time()  # record the start time
            is_text_pdf, results = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, img_num_list, text_layout_list)
            classify_time = int(time.time() - start_time)  # compute the elapsed time
            if is_text_pdf:
                pdf_meta['is_text_pdf'] = is_text_pdf
                jso['pdf_meta'] = pdf_meta
                jso['classify_time'] = classify_time
                # print(json.dumps(pdf_meta, ensure_ascii=False))
                allow_language = ['zh', 'en']  # allowed languages; currently only Simplified Chinese and English
                if text_language not in allow_language:  # drop the pdf if its language is not in the allowed set
                    jso['need_drop'] = True
                    jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
                    return jso
            else:
                # don't drop for now
                pdf_meta['is_text_pdf'] = is_text_pdf
                jso['pdf_meta'] = pdf_meta
                jso['classify_time'] = classify_time
                jso['need_drop'] = True
                jso['drop_reason'] = DropReason.NOT_IS_TEXT_PDF
                extra_info = {"classify_rules": []}
                for condition, result in results.items():
                    if not result:
                        extra_info["classify_rules"].append(condition)
                jso['extra_info'] = extra_info
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
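

# scihub-only side task: copy every table crop referenced by the intermediate
# dict to a flat "table_new/" prefix, renaming each image after the
# url-encoded title plus page/table indices.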
def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        data_source = get_data_source(jso)
        file_id = jso.get('file_id')
        book_name = f"{data_source}/{file_id}"
        title = jso.get('title')
        url_encode_title = quote(title, safe='')
        if data_source != 'scihub':
            return jso
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        i = 0
        for page in pdf_intermediate_dict.values():
            if page.get('tables'):
                if len(page['tables']) > 0:
                    j = 0
                    for table in page['tables']:
                        if debug_mode:
                            image_path = join_path("s3://mllm-raw-media/pdf2md_img/", book_name, table['image_path'])
                        else:
                            image_path = join_path("s3://mllm-raw-media/pdf2md_img/", table['image_path'])
                        if image_path.endswith('.jpg'):
                            j += 1
                            s3_client = get_s3_client(image_path)
                            bucket_name, bucket_key = parse_bucket_key(image_path)
                            # fetch the image into memory via the s3 client
                            image_bytes = s3_client.get_object(Bucket=bucket_name, Key=bucket_key)['Body'].read()
                            # save the image to its new location
                            if debug_mode:
                                # removeprefix (Python 3.9+) replaces the original lstrip('tables/'),
                                # which strips a character set rather than a prefix
                                new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/", url_encode_title + "_" + table['image_path'].removeprefix('tables/'))
                            else:
                                new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/", url_encode_title + f"_page{i}_{j}.jpg")
                            logger.info(new_image_path, file=sys.stderr)
                            bucket_name, bucket_key = parse_bucket_key(new_image_path)
                            s3_client.put_object(Bucket=bucket_name, Key=bucket_key, Body=image_bytes)
                        else:
                            continue
            i += 1
        # clear fields that are no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def drop_needdrop_pdf(jso: dict) -> dict:
    if jso.get('need_drop', False):
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
        jso["dropped"] = True
    return jso
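

# Render the (text-parse) intermediate dict into the universal content_list
# format; the nlp-markdown rendering is kept around below but disabled.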
def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            book_name = join_path(get_data_source(jso), jso['file_id'])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        # markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
        jso['content_list'] = mk_universal_format(pdf_intermediate_dict)
        # jso["content"] = markdown_content
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']}")
        # clear fields that are no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
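

# Text-pdf parsing: run the layout-model-assisted parser over the pdf. A pdf
# whose busiest page holds more than 3000 svg drawings is dropped up front as
# too computationally expensive.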
def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            return jso
    # main logic starts here
    s3_pdf_path = jso.get('file_location')
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get('doc_layout_result')
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f"{data_source}/{file_id}"
    # fixed as of 1.23.22
    # if debug_mode:
    #     pass
    # else:
    #     if book_name == "zlib/zlib_21929367":
    #         jso['need_drop'] = True
    #         jso['drop_reason'] = DropReason.SPECIAL_PDF
    #         return jso
    junk_img_bojids = jso['pdf_meta']['junk_img_bojids']
    # total_page = jso['pdf_meta']['total_page']
    # added a max_svgs check: drop the pdf if max_svgs exceeds 3000
    svgs_per_page_list = jso['pdf_meta']['svgs_per_page']
    max_svgs = max(svgs_per_page_list)
    if max_svgs > 3000:
        jso['need_drop'] = True
        jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
    # elif total_page > 1000:
    #     jso['need_drop'] = True
    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
    else:
        try:
            save_path = s3_image_save_path
            image_s3_config = get_s3_config(save_path)
            start_time = time.time()  # record the start time
            # log book_name and the parse start time first
            logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
            pdf_info_dict = parse_pdf_by_model(s3_pdf_path, s3_config, model_output_json_list, save_path,
                                               book_name, pdf_model_profile=None,
                                               image_s3_config=image_s3_config,
                                               start_page_id=start_page_id, junk_img_bojids=junk_img_bojids,
                                               debug_mode=debug_mode)
            if pdf_info_dict.get('need_drop', False):  # if the returned dict carries need_drop, extract drop_reason and skip this parse
                jso['need_drop'] = True
                jso['drop_reason'] = pdf_info_dict["drop_reason"]
            else:  # normal return: compress pdf_info_dict and store it
                pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
                jso['pdf_intermediate_dict'] = pdf_info_dict
            end_time = time.time()  # record the finish time
            parse_time = int(end_time - start_time)  # compute the elapsed time
            # after parsing, log book_name and the elapsed time
            logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", file=sys.stderr)
            jso['parse_time'] = parse_time
        except Exception as e:
            jso = exception_handler(jso, e)
    return jso


'''
Unified processing logic:
1. first call parse_pdf to handle text-based pdfs
2. then call ocr_dropped_parse_pdf to handle the pdfs that were dropped earlier
'''


def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    return jso


# dedicated to re-running pdfs that were dropped; after the run the need_drop
# flag must be reset to False
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    if not jso.get('need_drop', False):
        return jso
    else:
        jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
        jso['need_drop'] = False
        return jso


def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            return jso
    jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    return jso
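

# Shared body of the two OCR entry points above: run parse_pdf_by_ocr and
# store the compressed result under jso['pdf_intermediate_dict'].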
def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    s3_pdf_path = jso.get('file_location')
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get('doc_layout_result')
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f"{data_source}/{file_id}"
    try:
        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()  # record the start time
        # log book_name and the parse start time first
        logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
        pdf_info_dict = parse_pdf_by_ocr(
            s3_pdf_path,
            s3_config,
            model_output_json_list,
            save_path,
            book_name,
            pdf_model_profile=None,
            image_s3_config=image_s3_config,
            start_page_id=start_page_id,
            debug_mode=debug_mode
        )
        pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
        jso['pdf_intermediate_dict'] = pdf_info_dict
        end_time = time.time()  # record the finish time
        parse_time = int(end_time - start_time)  # compute the elapsed time
        # after parsing, log book_name and the elapsed time
        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", file=sys.stderr)
        jso['parse_time'] = parse_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
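

# The three exporters below share one shape: decompress the intermediate dict,
# render it (multimodal markdown / paragraph markdown / standard content list),
# store the result on jso, then blank the bulky intermediate fields.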
def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            book_name = join_path(get_data_source(jso), jso['file_id'])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
        jso["content"] = markdown_content
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
        # clear fields that are no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            book_name = join_path(get_data_source(jso), jso['file_id'])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
        jso["content_ocr"] = markdown_content
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
        # clear fields that are no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get('need_drop', False):
            book_name = join_path(get_data_source(jso), jso['file_id'])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
        jso["content_list"] = standard_format
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}", file=sys.stderr)
        # clear fields that are no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


if __name__ == "__main__":
    pass
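    # A minimal, hypothetical smoke test of the text-pdf chain. The field
    # values below are placeholders, not real data; uncomment to experiment.
    # demo_jso = {
    #     'file_location': 's3://some-bucket/some.pdf',  # hypothetical path
    #     'file_id': 'demo_0001',
    #     'data_source': 'demo',
    #     'doc_layout_result': [],  # model output would normally go here
    # }
    # demo_jso = meta_scan(demo_jso)
    # demo_jso = classify_by_type(demo_jso)
    # demo_jso = uni_parse_pdf(demo_jso)
    # demo_jso = pdf_intermediate_dict_to_markdown(demo_jso)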