pipeline.py

# coding=utf8
import time
from urllib.parse import quote

from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown
from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from loguru import logger

from app.common.s3 import get_s3_config, get_s3_client


def exception_handler(jso: dict, e):
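    """Log the exception, then flag the record as dropped with DropReason.Exception attached."""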
    logger.exception(e)
    jso['need_drop'] = True
    jso['drop_reason'] = DropReason.Exception
    jso['exception'] = f"ERROR: {e}"
    return jso


def meta_scan(jso: dict, doc_layout_check=True) -> dict:
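    """
    Read the PDF from S3 and attach the pdf_meta_scan() result to the record
    as 'pdf_meta'; store timings in 'read_file_time' and 'meta_scan_time'.
    With doc_layout_check on, records lacking 'doc_layout_result' are flagged
    to be dropped.
    """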
    s3_pdf_path = jso.get('file_location')
    s3_config = get_s3_config(s3_pdf_path)
    if doc_layout_check:
        if 'doc_layout_result' not in jso:  # check whether the record carries model data; if not, skip this PDF
            jso['need_drop'] = True
            jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
            return jso
    try:
        data_source = jso.get('data_source')
        file_id = jso.get('file_id')
        book_name = data_source + "/" + file_id
        # the first page had an excessive-drawings problem
        # special_pdf_list = ['zlib/zlib_21822650']
        # if book_name in special_pdf_list:
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.SPECIAL_PDF
        #     return jso
        start_time = time.time()  # record the start time
        logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}")
        file_content = read_file(s3_pdf_path, s3_config)
        read_file_time = int(time.time() - start_time)  # elapsed read time
        start_time = time.time()  # record the start time
        res = pdf_meta_scan(s3_pdf_path, file_content)
        if res.get('need_drop', False):  # if the result carries need_drop, record drop_reason and skip this parse
            jso['need_drop'] = True
            jso['drop_reason'] = res["drop_reason"]
        else:  # normal return
            jso['pdf_meta'] = res
            jso['content'] = ""
            jso['remark'] = ""
            jso['data_url'] = ""
        end_time = time.time()  # record the end time
        meta_scan_time = int(end_time - start_time)  # elapsed scan time
        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}")
        jso['read_file_time'] = read_file_time
        jso['meta_scan_time'] = meta_scan_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def classify_by_type(jso: dict, debug_mode=False) -> dict:
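    """
    Classify the PDF as text-based or not from its scanned metadata.

    Encrypted or password-protected PDFs are flagged to be dropped, as are
    non-text PDFs and text PDFs whose language is outside the allow-list
    ('zh' and 'en'). For non-text PDFs, the classify rules that failed are
    recorded under 'extra_info'.
    """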
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, honor an existing need_drop flag
        if jso.get('need_drop', False):
            return jso
    # main logic starts here
    try:
        pdf_meta = jso.get('pdf_meta')
        data_source = jso.get('data_source')
        file_id = jso.get('file_id')
        book_name = data_source + "/" + file_id
        total_page = pdf_meta["total_page"]
        page_width = pdf_meta["page_width_pts"]
        page_height = pdf_meta["page_height_pts"]
        img_sz_list = pdf_meta["image_info_per_page"]
        img_num_list = pdf_meta['imgs_per_page']
        text_len_list = pdf_meta['text_len_per_page']
        text_layout_list = pdf_meta['text_layout_per_page']
        text_language = pdf_meta['text_language']
        # allow_language = ['zh', 'en']  # allowed languages; currently only Simplified Chinese and English
        # if text_language not in allow_language:  # drop if the language is not allowed
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
        #     return jso
        pdf_path = pdf_meta['pdf_path']
        is_encrypted = pdf_meta['is_encrypted']
        is_needs_password = pdf_meta['is_needs_password']
        if is_encrypted or is_needs_password:  # encrypted and password-protected PDFs are not processed
            jso['need_drop'] = True
            jso['drop_reason'] = DropReason.ENCRYPTED
        else:
            start_time = time.time()  # record the start time
            is_text_pdf, results = classify(pdf_path, total_page, page_width, page_height, img_sz_list,
                                            text_len_list, img_num_list, text_layout_list)
            classify_time = int(time.time() - start_time)  # elapsed time
            if is_text_pdf:
                pdf_meta['is_text_pdf'] = is_text_pdf
                jso['pdf_meta'] = pdf_meta
                jso['classify_time'] = classify_time
                # print(json.dumps(pdf_meta, ensure_ascii=False))
                allow_language = ['zh', 'en']  # allowed languages; currently only Simplified Chinese and English
                if text_language not in allow_language:  # drop if the language is not allowed
                    jso['need_drop'] = True
                    jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
                    return jso
            else:  # not a text PDF: keep the metadata but flag the record to be dropped
                pdf_meta['is_text_pdf'] = is_text_pdf
                jso['pdf_meta'] = pdf_meta
                jso['classify_time'] = classify_time
                jso['need_drop'] = True
                jso['drop_reason'] = DropReason.NOT_IS_TEXT_PDF
                extra_info = {"classify_rules": []}
                for condition, result in results.items():
                    if not result:
                        extra_info["classify_rules"].append(condition)
                jso['extra_info'] = extra_info
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
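    """
    For scihub records, copy each page's extracted table images to the
    table_new/ prefix on S3 under a title-based file name, then clear the
    bulky intermediate fields from the record.
    """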
    if debug_mode:
        pass
    else:  # if debug is off, honor an existing need_drop flag
        if jso.get('need_drop', False):
            logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop")
            jso["dropped"] = True
            return jso
    try:
        data_source = jso.get('data_source')
        file_id = jso.get('file_id')
        book_name = data_source + "/" + file_id
        title = jso.get('title')
        url_encode_title = quote(title, safe='')
        if data_source != 'scihub':
            return jso
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        i = 0
        for page in pdf_intermediate_dict.values():
            if page.get('tables'):
                if len(page['tables']) > 0:
                    j = 0
                    for table in page['tables']:
                        if debug_mode:
                            image_path = join_path("s3://mllm-raw-media/pdf2md_img/", book_name, table['image_path'])
                        else:
                            image_path = join_path("s3://mllm-raw-media/pdf2md_img/", table['image_path'])
                        if image_path.endswith('.jpg'):
                            j += 1
                            s3_client = get_s3_client(image_path)
                            bucket_name, bucket_key = parse_bucket_key(image_path)
                            # fetch the image into memory via s3_client
                            image_bytes = s3_client.get_object(Bucket=bucket_name, Key=bucket_key)['Body'].read()
                            # save the image to its new location
                            if debug_mode:
                                # removeprefix (not lstrip, which strips a character set) drops the 'tables/' prefix
                                new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/",
                                                           url_encode_title + "_" + table['image_path'].removeprefix('tables/'))
                            else:
                                new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/",
                                                           url_encode_title + f"_page{i}_{j}.jpg")
                            logger.info(new_image_path)
                            bucket_name, bucket_key = parse_bucket_key(new_image_path)
                            s3_client.put_object(Bucket=bucket_name, Key=bucket_key, Body=image_bytes)
                        else:
                            continue
            i += 1
        # clear fields that are no longer needed
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def drop_needdrop_pdf(jso: dict) -> dict:
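    """Mark a record flagged with 'need_drop' as dropped, logging its book name."""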
    if jso.get('need_drop', False):
        logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop")
        jso["dropped"] = True
    return jso


def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
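    """
    Decompress the intermediate parse dict, render it to markdown via
    mk_nlp_markdown(), store the result under 'content', and clear the bulky
    intermediate fields from the record.
    """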
    if debug_mode:
        pass
    else:  # if debug is off, honor an existing need_drop flag
        if jso.get('need_drop', False):
            book_name = join_path(jso['data_source'], jso['file_id'])
            logger.info(f"book_name is:{book_name} need drop")
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso['pdf_intermediate_dict']
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
        jso["content"] = markdown_content
        logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']},markdown content length is {len(markdown_content)}")
        # clear fields that are no longer needed
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
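    """
    Run the model-assisted parse (parse_pdf_by_model) on the PDF and store the
    compressed result under 'pdf_intermediate_dict'; the parse duration goes
    into 'parse_time'. PDFs whose busiest page holds more than 3000 SVGs are
    flagged to be dropped instead of parsed.
    """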
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, honor an existing need_drop flag
        if jso.get('need_drop', False):
            return jso
    # main logic starts here
    s3_pdf_path = jso.get('file_location')
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get('doc_layout_result')
    data_source = jso.get('data_source')
    file_id = jso.get('file_id')
    book_name = data_source + "/" + file_id
    # fixed as of 1.23.22
    # if debug_mode:
    #     pass
    # else:
    #     if book_name == "zlib/zlib_21929367":
    #         jso['need_drop'] = True
    #         jso['drop_reason'] = DropReason.SPECIAL_PDF
    #         return jso
    junk_img_bojids = jso['pdf_meta']['junk_img_bojids']
    # total_page = jso['pdf_meta']['total_page']
    # check max_svgs: if any single page holds more than 3000 SVGs, drop the PDF
    svgs_per_page_list = jso['pdf_meta']['svgs_per_page']
    max_svgs = max(svgs_per_page_list)
    if max_svgs > 3000:
        jso['need_drop'] = True
        jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
    # elif total_page > 1000:
    #     jso['need_drop'] = True
    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
    else:
        try:
            save_path = "s3://mllm-raw-media/pdf2md_img/"
            image_s3_config = get_s3_config(save_path)
            start_time = time.time()  # record the start time
            # log book_name and the parse start time
            logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}")
            pdf_info_dict = parse_pdf_by_model(s3_pdf_path, s3_config, model_output_json_list, save_path,
                                               book_name, pdf_model_profile=None,
                                               image_s3_config=image_s3_config,
                                               start_page_id=start_page_id, junk_img_bojids=junk_img_bojids,
                                               debug_mode=debug_mode)
            if pdf_info_dict.get('need_drop', False):  # if the result carries need_drop, record drop_reason and skip this parse
                jso['need_drop'] = True
                jso['drop_reason'] = pdf_info_dict["drop_reason"]
            else:  # normal return: compress pdf_info_dict and store it
                pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
                jso['pdf_intermediate_dict'] = pdf_info_dict
            end_time = time.time()  # record the end time
            parse_time = int(end_time - start_time)  # elapsed time
            # log book_name and elapsed time once parsing is done
            logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}")
            jso['parse_time'] = parse_time
        except Exception as e:
            jso = exception_handler(jso, e)
    return jso


if __name__ == "__main__":
    pass
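
# A minimal usage sketch, not part of the pipeline: each stage reads and
# returns the same record dict, so stages can be chained in order. The field
# values below are hypothetical; only the keys (file_location, data_source,
# file_id, doc_layout_result) are the ones the stages above actually read.
#
# record = {
#     "file_location": "s3://some-bucket/path/to/file.pdf",  # hypothetical path
#     "data_source": "scihub",
#     "file_id": "example_0001",
#     "doc_layout_result": [],  # model layout output, normally present
# }
# for stage in (meta_scan, classify_by_type, parse_pdf,
#               pdf_intermediate_dict_to_markdown):
#     record = stage(record)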