# coding=utf8
import sys
import time
from urllib.parse import quote

from magic_pdf.dict2md.ocr_mkcontent import (
    ocr_mk_nlp_markdown,
    ocr_mk_mm_markdown,
    ocr_mk_mm_standard_format,
    ocr_mk_mm_markdown_with_para,
)
from magic_pdf.libs.commons import (
    read_file,
    join_path,
    parse_bucket_key,
    formatted_time,
    s3_image_save_path,
)
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown, mk_universal_format
from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from loguru import logger
from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
from app.common.s3 import get_s3_config, get_s3_client
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso["need_drop"] = True
    jso["drop_reason"] = DropReason.Exception
    jso["exception"] = f"ERROR: {e}"
    return jso


def get_data_type(jso: dict):
    data_type = jso.get("data_type")
    if data_type is None:
        data_type = jso.get("file_type")
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get("bookid")
    if book_id is None:
        book_id = jso.get("original_file_id")
    return book_id


def get_data_source(jso: dict):
    data_source = jso.get("data_source")
    if data_source is None:
        data_source = jso.get("file_source")
    return data_source
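

# Every stage below takes and returns a single job dict ("jso"). A minimal
# sketch of the payload the helpers above read (hypothetical values; the
# fallback keys file_type/file_source/original_file_id look like legacy
# aliases for the primary ones, hence the getters' two-step lookups):
#
#     jso = {
#         "file_location": "s3://some-bucket/path/to/file.pdf",
#         "file_id": "example_0001",
#         "data_source": "scihub",     # legacy key: "file_source"
#         "data_type": "pdf",          # legacy key: "file_type"
#         "bookid": "example_0001",    # legacy key: "original_file_id"
#         "doc_layout_result": [...],  # per-page layout-model output
#     }
#     get_data_source(jso)  # -> "scihub"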


def meta_scan(jso: dict, doc_layout_check=True) -> dict:
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    if doc_layout_check:
        if (
            "doc_layout_result" not in jso
        ):  # check whether the json contains the model output; if not, skip this pdf
            jso["need_drop"] = True
            jso["drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT
            return jso
    try:
        data_source = get_data_source(jso)
        file_id = jso.get("file_id")
        book_name = f"{data_source}/{file_id}"
        # some pdfs have an excessive number of drawings on the first page
        # special_pdf_list = ['zlib/zlib_21822650']
        # if book_name in special_pdf_list:
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.SPECIAL_PDF
        #     return jso
        start_time = time.time()  # record the start time
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
            file=sys.stderr,
        )
        file_content = read_file(s3_pdf_path, s3_config)
        read_file_time = int(time.time() - start_time)  # compute the elapsed time
        start_time = time.time()  # record the start time
        res = pdf_meta_scan(s3_pdf_path, file_content)
        if res.get(
            "need_drop", False
        ):  # if the returned dict contains need_drop, take its drop_reason and skip this parse
            jso["need_drop"] = True
            jso["drop_reason"] = res["drop_reason"]
        else:  # normal return
            jso["pdf_meta"] = res
            jso["content"] = ""
            jso["remark"] = ""
            jso["data_url"] = ""
        end_time = time.time()  # record the end time
        meta_scan_time = int(end_time - start_time)  # compute the elapsed time
        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}",
            file=sys.stderr,
        )
        jso["read_file_time"] = read_file_time
        jso["meta_scan_time"] = meta_scan_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def classify_by_type(jso: dict, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            return jso
    # main logic starts here
    try:
        pdf_meta = jso.get("pdf_meta")
        data_source = get_data_source(jso)
        file_id = jso.get("file_id")
        book_name = f"{data_source}/{file_id}"
        total_page = pdf_meta["total_page"]
        page_width = pdf_meta["page_width_pts"]
        page_height = pdf_meta["page_height_pts"]
        img_sz_list = pdf_meta["image_info_per_page"]
        img_num_list = pdf_meta["imgs_per_page"]
        text_len_list = pdf_meta["text_len_per_page"]
        text_layout_list = pdf_meta["text_layout_per_page"]
        text_language = pdf_meta["text_language"]
        # allow_language = ['zh', 'en']  # allowed languages: currently only Simplified Chinese and English
        # if text_language not in allow_language:  # drop if the language is not allowed
        #     jso['need_drop'] = True
        #     jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
        #     return jso
        pdf_path = pdf_meta["pdf_path"]
        is_encrypted = pdf_meta["is_encrypted"]
        is_needs_password = pdf_meta["is_needs_password"]
        if (
            is_encrypted or is_needs_password
        ):  # encrypted pdfs, password-protected pdfs, and pdfs with no pages are all skipped
            jso["need_drop"] = True
            jso["drop_reason"] = DropReason.ENCRYPTED
        else:
            start_time = time.time()  # record the start time
            is_text_pdf, results = classify(
                pdf_path,
                total_page,
                page_width,
                page_height,
                img_sz_list,
                text_len_list,
                img_num_list,
                text_layout_list,
            )
            classify_time = int(time.time() - start_time)  # compute the elapsed time
            if is_text_pdf:
                pdf_meta["is_text_pdf"] = is_text_pdf
                jso["pdf_meta"] = pdf_meta
                jso["classify_time"] = classify_time
                # print(json.dumps(pdf_meta, ensure_ascii=False))
                allow_language = ["zh", "en"]  # allowed languages: currently only Simplified Chinese and English
                if (
                    text_language not in allow_language
                ):  # drop if the language is not allowed
                    jso["need_drop"] = True
                    jso["drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE
                    return jso
            else:
                # do not drop it yet
                pdf_meta["is_text_pdf"] = is_text_pdf
                jso["pdf_meta"] = pdf_meta
                jso["classify_time"] = classify_time
                jso["need_drop"] = True
                jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
                extra_info = {"classify_rules": []}
                for condition, result in results.items():
                    if not result:
                        extra_info["classify_rules"].append(condition)
                jso["extra_info"] = extra_info
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            logger.info(
                f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
                file=sys.stderr,
            )
            jso["dropped"] = True
            return jso
    try:
        data_source = get_data_source(jso)
        file_id = jso.get("file_id")
        book_name = f"{data_source}/{file_id}"
        title = jso.get("title")
        url_encode_title = quote(title, safe="")
        if data_source != "scihub":
            return jso
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        i = 0
        for page in pdf_intermediate_dict.values():
            if page.get("tables"):
                if len(page["tables"]) > 0:
                    j = 0
                    for table in page["tables"]:
                        if debug_mode:
                            image_path = join_path(
                                "s3://mllm-raw-media/pdf2md_img/",
                                book_name,
                                table["image_path"],
                            )
                        else:
                            image_path = join_path(
                                "s3://mllm-raw-media/pdf2md_img/", table["image_path"]
                            )
                        if image_path.endswith(".jpg"):
                            j += 1
                            s3_client = get_s3_client(image_path)
                            bucket_name, bucket_key = parse_bucket_key(image_path)
                            # fetch the image into memory via s3_client
                            image_bytes = s3_client.get_object(
                                Bucket=bucket_name, Key=bucket_key
                            )["Body"].read()
                            # save the image to its new location
                            if debug_mode:
                                # removeprefix, not lstrip: lstrip("tables/") would
                                # strip any character from that set, not the prefix
                                new_image_path = join_path(
                                    "s3://mllm-raw-media/pdf2md_img/table_new/",
                                    url_encode_title
                                    + "_"
                                    + table["image_path"].removeprefix("tables/"),
                                )
                            else:
                                new_image_path = join_path(
                                    "s3://mllm-raw-media/pdf2md_img/table_new/",
                                    url_encode_title + f"_page{i}_{j}.jpg",
                                )
                            logger.info(new_image_path, file=sys.stderr)
                            bucket_name, bucket_key = parse_bucket_key(new_image_path)
                            s3_client.put_object(
                                Bucket=bucket_name, Key=bucket_key, Body=image_bytes
                            )
                        else:
                            continue
            i += 1
        # clear out information that is no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def drop_needdrop_pdf(jso: dict) -> dict:
    if jso.get("need_drop", False):
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
            file=sys.stderr,
        )
        jso["dropped"] = True
    return jso


def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            book_name = join_path(get_data_source(jso), jso["file_id"])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        # markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
        jso["content_list"] = mk_universal_format(pdf_intermediate_dict)
        # jso["content"] = markdown_content
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']}")
        # clear out information that is no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            return jso
    # main logic starts here
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"
    # fixed as of 1.23.22
    # if debug_mode:
    #     pass
    # else:
    #     if book_name == "zlib/zlib_21929367":
    #         jso['need_drop'] = True
    #         jso['drop_reason'] = DropReason.SPECIAL_PDF
    #         return jso
    junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]
    # total_page = jso['pdf_meta']['total_page']
    # check max_svgs: if any page has more than 3000 svgs, drop the pdf
    svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
    max_svgs = max(svgs_per_page_list)
    if max_svgs > 3000:
        jso["need_drop"] = True
        jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
    # elif total_page > 1000:
    #     jso['need_drop'] = True
    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
    else:
        try:
            save_path = s3_image_save_path
            image_s3_config = get_s3_config(save_path)
            start_time = time.time()  # record the start time
            # log book_name and the parse start time
            logger.info(
                f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
                file=sys.stderr,
            )
            pdf_info_dict = parse_pdf_by_model(
                s3_pdf_path,
                s3_config,
                model_output_json_list,
                save_path,
                book_name,
                pdf_model_profile=None,
                image_s3_config=image_s3_config,
                start_page_id=start_page_id,
                junk_img_bojids=junk_img_bojids,
                debug_mode=debug_mode,
            )
            if pdf_info_dict.get(
                "need_drop", False
            ):  # if the returned dict contains need_drop, take its drop_reason and skip this parse
                jso["need_drop"] = True
                jso["drop_reason"] = pdf_info_dict["drop_reason"]
            else:  # normal return: compress pdf_info_dict and store it
                pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
                jso["pdf_intermediate_dict"] = pdf_info_dict
            end_time = time.time()  # record the finish time
            parse_time = int(end_time - start_time)  # compute the elapsed time
            # after parsing, log book_name and the elapsed time
            logger.info(
                f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
                file=sys.stderr,
            )
            jso["parse_time"] = parse_time
        except Exception as e:
            jso = exception_handler(jso, e)
    return jso
  354. """
  355. 统一处理逻辑
  356. 1.先调用parse_pdf对文本类pdf进行处理
  357. 2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理
  358. """
  359. def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  360. jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  361. jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  362. return jso


# dedicated to re-running pdfs that were dropped; once done, need_drop must be reset to False
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    if not jso.get("need_drop", False):
        return jso
    else:
        jso = ocr_parse_pdf_core(
            jso, start_page_id=start_page_id, debug_mode=debug_mode
        )
        jso["need_drop"] = False
    return jso
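

# For example (hypothetical flow): a pdf that classify_by_type dropped with
# DropReason.NOT_IS_TEXT_PDF still carries need_drop=True, so
# ocr_dropped_parse_pdf re-parses it with parse_pdf_by_ocr and then clears
# the flag so downstream stages treat it as a normal record:
#
#     jso = classify_by_type(jso)       # may set need_drop=True
#     jso = ocr_dropped_parse_pdf(jso)  # OCR retry; resets need_drop to False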


def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            return jso
    jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    return jso


def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"
    try:
        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()  # record the start time
        # log book_name and the parse start time
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
            file=sys.stderr,
        )
        pdf_info_dict = parse_pdf_by_ocr(
            s3_pdf_path,
            s3_config,
            model_output_json_list,
            save_path,
            book_name,
            pdf_model_profile=None,
            image_s3_config=image_s3_config,
            start_page_id=start_page_id,
            debug_mode=debug_mode,
        )
        pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
        jso["pdf_intermediate_dict"] = pdf_info_dict
        end_time = time.time()  # record the finish time
        parse_time = int(end_time - start_time)  # compute the elapsed time
        # after parsing, log book_name and the elapsed time
        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
            file=sys.stderr,
        )
        jso["parse_time"] = parse_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            book_name = join_path(get_data_source(jso), jso["file_id"])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
        jso["content"] = markdown_content
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
            file=sys.stderr,
        )
        # clear out information that is no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
    jso: dict, debug_mode=False
) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            book_name = join_path(get_data_source(jso), jso["file_id"])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
        jso["content_ocr"] = markdown_content
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
            file=sys.stderr,
        )
        # clear out information that is no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            book_name = join_path(get_data_source(jso), jso["file_id"])
            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
            jso["dropped"] = True
            return jso
    try:
        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
        # decompress pdf_intermediate_dict
        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
        standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
        jso["content_list"] = standard_format
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
            file=sys.stderr,
        )
        # clear out information that is no longer useful
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    # check the debug switch
    if debug_mode:
        pass
    else:  # if debug is off, check for the need_drop flag
        if jso.get("need_drop", False):
            return jso
    # main logic starts here
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"
    # fixed as of 1.23.22
    # if debug_mode:
    #     pass
    # else:
    #     if book_name == "zlib/zlib_21929367":
    #         jso['need_drop'] = True
    #         jso['drop_reason'] = DropReason.SPECIAL_PDF
    #         return jso
    junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]
    # total_page = jso['pdf_meta']['total_page']
    # check max_svgs: if any page has more than 3000 svgs, drop the pdf
    svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
    max_svgs = max(svgs_per_page_list)
    if max_svgs > 3000:
        jso["need_drop"] = True
        jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
    # elif total_page > 1000:
    #     jso['need_drop'] = True
    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
    else:
        try:
            save_path = s3_image_save_path
            image_s3_config = get_s3_config(save_path)
            start_time = time.time()  # record the start time
            # log book_name and the parse start time
            logger.info(
                f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
                file=sys.stderr,
            )
            pdf_info_dict = parse_pdf_for_train(
                s3_pdf_path,
                s3_config,
                model_output_json_list,
                save_path,
                book_name,
                pdf_model_profile=None,
                image_s3_config=image_s3_config,
                start_page_id=start_page_id,
                junk_img_bojids=junk_img_bojids,
                debug_mode=debug_mode,
            )
            if pdf_info_dict.get(
                "need_drop", False
            ):  # if the returned dict contains need_drop, take its drop_reason and skip this parse
                jso["need_drop"] = True
                jso["drop_reason"] = pdf_info_dict["drop_reason"]
            else:  # normal return: compress pdf_info_dict and store it
                # convert before compressing: convert_to_train_format expects the
                # uncompressed dict, not the compressed blob
                jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
                pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
                jso["pdf_intermediate_dict"] = pdf_info_dict
            end_time = time.time()  # record the finish time
            parse_time = int(end_time - start_time)  # compute the elapsed time
            # after parsing, log book_name and the elapsed time
            logger.info(
                f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
                file=sys.stderr,
            )
            jso["parse_time"] = parse_time
        except Exception as e:
            jso = exception_handler(jso, e)
    return jso


if __name__ == "__main__":
    pass
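    # A minimal sketch of how the stages chain for one record (hypothetical
    # payload; in production the records presumably stream through a job
    # queue rather than being built by hand):
    #
    #     jso = {
    #         "file_location": "s3://some-bucket/path/to/file.pdf",
    #         "file_id": "example_0001",
    #         "data_source": "scihub",
    #         "doc_layout_result": [],  # per-page layout-model output
    #     }
    #     jso = meta_scan(jso)
    #     jso = classify_by_type(jso)
    #     jso = uni_parse_pdf(jso)  # text parse, then OCR retry for dropped pdfs
    #     jso = pdf_intermediate_dict_to_markdown(jso)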