pipeline.py 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707
  1. # coding=utf8
  2. import sys
  3. import time
  4. from urllib.parse import quote
  5. from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown, ocr_mk_nlp_markdown_with_para, \
  6. ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_mm_markdown_with_para, ocr_mk_mm_standard_format, \
  7. make_standard_format_with_para
  8. from magic_pdf.libs.commons import (
  9. read_file,
  10. join_path,
  11. parse_bucket_key,
  12. formatted_time,
  13. s3_image_save_path,
  14. )
  15. from magic_pdf.libs.drop_reason import DropReason
  16. from magic_pdf.libs.json_compressor import JsonCompressor
  17. from magic_pdf.dict2md.mkcontent import mk_nlp_markdown, mk_universal_format
  18. from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
  19. from magic_pdf.filter.pdf_classify_by_type import classify
  20. from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
  21. from loguru import logger
  22. from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
  23. from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
  24. from magic_pdf.spark.base import exception_handler, get_data_source
  25. from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
  26. from app.common.s3 import get_s3_config, get_s3_client
  27. def get_data_type(jso: dict):
  28. data_type = jso.get("data_type")
  29. if data_type is None:
  30. data_type = jso.get("file_type")
  31. return data_type
  32. def get_bookid(jso: dict):
  33. book_id = jso.get("bookid")
  34. if book_id is None:
  35. book_id = jso.get("original_file_id")
  36. return book_id
  37. def meta_scan(jso: dict, doc_layout_check=True) -> dict:
  38. s3_pdf_path = jso.get("file_location")
  39. s3_config = get_s3_config(s3_pdf_path)
  40. if doc_layout_check:
  41. if (
  42. "doc_layout_result" not in jso
  43. ): # 检测json中是存在模型数据,如果没有则需要跳过该pdf
  44. jso["need_drop"] = True
  45. jso["drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT
  46. return jso
  47. try:
  48. data_source = get_data_source(jso)
  49. file_id = jso.get("file_id")
  50. book_name = f"{data_source}/{file_id}"
  51. # 首页存在超量drawing问题
  52. # special_pdf_list = ['zlib/zlib_21822650']
  53. # if book_name in special_pdf_list:
  54. # jso['need_drop'] = True
  55. # jso['drop_reason'] = DropReason.SPECIAL_PDF
  56. # return jso
  57. start_time = time.time() # 记录开始时间
  58. logger.info(
  59. f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
  60. file=sys.stderr,
  61. )
  62. file_content = read_file(s3_pdf_path, s3_config)
  63. read_file_time = int(time.time() - start_time) # 计算执行时间
  64. start_time = time.time() # 记录开始时间
  65. res = pdf_meta_scan(s3_pdf_path, file_content)
  66. if res.get(
  67. "need_drop", False
  68. ): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
  69. jso["need_drop"] = True
  70. jso["drop_reason"] = res["drop_reason"]
  71. else: # 正常返回
  72. jso["pdf_meta"] = res
  73. jso["content"] = ""
  74. jso["remark"] = ""
  75. jso["data_url"] = ""
  76. end_time = time.time() # 记录结束时间
  77. meta_scan_time = int(end_time - start_time) # 计算执行时间
  78. logger.info(
  79. f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}",
  80. file=sys.stderr,
  81. )
  82. jso["read_file_time"] = read_file_time
  83. jso["meta_scan_time"] = meta_scan_time
  84. except Exception as e:
  85. jso = exception_handler(jso, e)
  86. return jso
  87. def classify_by_type(jso: dict, debug_mode=False) -> dict:
  88. # 检测debug开关
  89. if debug_mode:
  90. pass
  91. else: # 如果debug没开,则检测是否有needdrop字段
  92. if jso.get("need_drop", False):
  93. return jso
  94. # 开始正式逻辑
  95. try:
  96. pdf_meta = jso.get("pdf_meta")
  97. data_source = get_data_source(jso)
  98. file_id = jso.get("file_id")
  99. book_name = f"{data_source}/{file_id}"
  100. total_page = pdf_meta["total_page"]
  101. page_width = pdf_meta["page_width_pts"]
  102. page_height = pdf_meta["page_height_pts"]
  103. img_sz_list = pdf_meta["image_info_per_page"]
  104. img_num_list = pdf_meta["imgs_per_page"]
  105. text_len_list = pdf_meta["text_len_per_page"]
  106. text_layout_list = pdf_meta["text_layout_per_page"]
  107. text_language = pdf_meta["text_language"]
  108. # allow_language = ['zh', 'en'] # 允许的语言,目前只允许简中和英文的
  109. # if text_language not in allow_language: # 如果语言不在允许的语言中,则drop
  110. # jso['need_drop'] = True
  111. # jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE
  112. # return jso
  113. pdf_path = pdf_meta["pdf_path"]
  114. is_encrypted = pdf_meta["is_encrypted"]
  115. is_needs_password = pdf_meta["is_needs_password"]
  116. if (
  117. is_encrypted or is_needs_password
  118. ): # 加密的,需要密码的,没有页面的,都不处理
  119. jso["need_drop"] = True
  120. jso["drop_reason"] = DropReason.ENCRYPTED
  121. else:
  122. start_time = time.time() # 记录开始时间
  123. is_text_pdf, results = classify(
  124. pdf_path,
  125. total_page,
  126. page_width,
  127. page_height,
  128. img_sz_list,
  129. text_len_list,
  130. img_num_list,
  131. text_layout_list,
  132. )
  133. classify_time = int(time.time() - start_time) # 计算执行时间
  134. if is_text_pdf:
  135. pdf_meta["is_text_pdf"] = is_text_pdf
  136. jso["pdf_meta"] = pdf_meta
  137. jso["classify_time"] = classify_time
  138. # print(json.dumps(pdf_meta, ensure_ascii=False))
  139. allow_language = ["zh", "en"] # 允许的语言,目前只允许简中和英文的
  140. if (
  141. text_language not in allow_language
  142. ): # 如果语言不在允许的语言中,则drop
  143. jso["need_drop"] = True
  144. jso["drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE
  145. return jso
  146. else:
  147. # 先不drop
  148. pdf_meta["is_text_pdf"] = is_text_pdf
  149. jso["pdf_meta"] = pdf_meta
  150. jso["classify_time"] = classify_time
  151. jso["need_drop"] = True
  152. jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
  153. extra_info = {"classify_rules": []}
  154. for condition, result in results.items():
  155. if not result:
  156. extra_info["classify_rules"].append(condition)
  157. jso["extra_info"] = extra_info
  158. except Exception as e:
  159. jso = exception_handler(jso, e)
  160. return jso
  161. def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
  162. if debug_mode:
  163. pass
  164. else: # 如果debug没开,则检测是否有needdrop字段
  165. if jso.get("need_drop", False):
  166. logger.info(
  167. f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
  168. file=sys.stderr,
  169. )
  170. jso["dropped"] = True
  171. return jso
  172. try:
  173. data_source = get_data_source(jso)
  174. file_id = jso.get("file_id")
  175. book_name = f"{data_source}/{file_id}"
  176. title = jso.get("title")
  177. url_encode_title = quote(title, safe="")
  178. if data_source != "scihub":
  179. return jso
  180. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  181. # 将 pdf_intermediate_dict 解压
  182. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  183. i = 0
  184. for page in pdf_intermediate_dict.values():
  185. if page.get("tables"):
  186. if len(page["tables"]) > 0:
  187. j = 0
  188. for table in page["tables"]:
  189. if debug_mode:
  190. image_path = join_path(
  191. "s3://mllm-raw-media/pdf2md_img/",
  192. book_name,
  193. table["image_path"],
  194. )
  195. else:
  196. image_path = join_path(
  197. "s3://mllm-raw-media/pdf2md_img/", table["image_path"]
  198. )
  199. if image_path.endswith(".jpg"):
  200. j += 1
  201. s3_client = get_s3_client(image_path)
  202. bucket_name, bucket_key = parse_bucket_key(image_path)
  203. # 通过s3_client获取图片到内存
  204. image_bytes = s3_client.get_object(
  205. Bucket=bucket_name, Key=bucket_key
  206. )["Body"].read()
  207. # 保存图片到新的位置
  208. if debug_mode:
  209. new_image_path = join_path(
  210. "s3://mllm-raw-media/pdf2md_img/table_new/",
  211. url_encode_title
  212. + "_"
  213. + table["image_path"].lstrip("tables/"),
  214. )
  215. else:
  216. new_image_path = join_path(
  217. "s3://mllm-raw-media/pdf2md_img/table_new/",
  218. url_encode_title + f"_page{i}_{j}.jpg",
  219. )
  220. logger.info(new_image_path, file=sys.stderr)
  221. bucket_name, bucket_key = parse_bucket_key(new_image_path)
  222. s3_client.put_object(
  223. Bucket=bucket_name, Key=bucket_key, Body=image_bytes
  224. )
  225. else:
  226. continue
  227. i += 1
  228. # 把无用的信息清空
  229. jso["doc_layout_result"] = ""
  230. jso["pdf_intermediate_dict"] = ""
  231. jso["pdf_meta"] = ""
  232. except Exception as e:
  233. jso = exception_handler(jso, e)
  234. return jso
  235. def drop_needdrop_pdf(jso: dict) -> dict:
  236. if jso.get("need_drop", False):
  237. logger.info(
  238. f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop",
  239. file=sys.stderr,
  240. )
  241. jso["dropped"] = True
  242. return jso
  243. def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
  244. if debug_mode:
  245. pass
  246. else: # 如果debug没开,则检测是否有needdrop字段
  247. if jso.get("need_drop", False):
  248. book_name = join_path(get_data_source(jso), jso["file_id"])
  249. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  250. jso["dropped"] = True
  251. return jso
  252. try:
  253. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  254. # 将 pdf_intermediate_dict 解压
  255. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  256. # markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
  257. jso["content_list"] = mk_universal_format(pdf_intermediate_dict)
  258. # jso["content"] = markdown_content
  259. logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']}")
  260. # 把无用的信息清空
  261. jso["doc_layout_result"] = ""
  262. jso["pdf_intermediate_dict"] = ""
  263. jso["pdf_meta"] = ""
  264. except Exception as e:
  265. jso = exception_handler(jso, e)
  266. return jso
  267. def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  268. # 检测debug开关
  269. if debug_mode:
  270. pass
  271. else: # 如果debug没开,则检测是否有needdrop字段
  272. if jso.get("need_drop", False):
  273. return jso
  274. # 开始正式逻辑
  275. s3_pdf_path = jso.get("file_location")
  276. s3_config = get_s3_config(s3_pdf_path)
  277. model_output_json_list = jso.get("doc_layout_result")
  278. data_source = get_data_source(jso)
  279. file_id = jso.get("file_id")
  280. book_name = f"{data_source}/{file_id}"
  281. # 1.23.22已修复
  282. # if debug_mode:
  283. # pass
  284. # else:
  285. # if book_name == "zlib/zlib_21929367":
  286. # jso['need_drop'] = True
  287. # jso['drop_reason'] = DropReason.SPECIAL_PDF
  288. # return jso
  289. junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]
  290. # total_page = jso['pdf_meta']['total_page']
  291. # 增加检测 max_svgs 数量的检测逻辑,如果 max_svgs 超过3000则drop
  292. svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
  293. max_svgs = max(svgs_per_page_list)
  294. if max_svgs > 3000:
  295. jso["need_drop"] = True
  296. jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
  297. # elif total_page > 1000:
  298. # jso['need_drop'] = True
  299. # jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
  300. else:
  301. try:
  302. save_path = s3_image_save_path
  303. image_s3_config = get_s3_config(save_path)
  304. start_time = time.time() # 记录开始时间
  305. # 先打印一下book_name和解析开始的时间
  306. logger.info(
  307. f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
  308. file=sys.stderr,
  309. )
  310. pdf_info_dict = parse_pdf_by_model(
  311. s3_pdf_path,
  312. s3_config,
  313. model_output_json_list,
  314. save_path,
  315. book_name,
  316. pdf_model_profile=None,
  317. image_s3_config=image_s3_config,
  318. start_page_id=start_page_id,
  319. junk_img_bojids=junk_img_bojids,
  320. debug_mode=debug_mode,
  321. )
  322. if pdf_info_dict.get(
  323. "need_drop", False
  324. ): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
  325. jso["need_drop"] = True
  326. jso["drop_reason"] = pdf_info_dict["drop_reason"]
  327. else: # 正常返回,将 pdf_info_dict 压缩并存储
  328. pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
  329. jso["pdf_intermediate_dict"] = pdf_info_dict
  330. end_time = time.time() # 记录完成时间
  331. parse_time = int(end_time - start_time) # 计算执行时间
  332. # 解析完成后打印一下book_name和耗时
  333. logger.info(
  334. f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
  335. file=sys.stderr,
  336. )
  337. jso["parse_time"] = parse_time
  338. except Exception as e:
  339. jso = exception_handler(jso, e)
  340. return jso
"""
Unified processing logic:
1. First call parse_pdf to handle text-based PDFs.
2. Then call ocr_dropped_parse_pdf to process the PDFs that were dropped earlier.
"""
  346. def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  347. jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  348. jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  349. return jso
  350. def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  351. # 检测debug开关
  352. if debug_mode:
  353. pass
  354. else: # 如果debug没开,则检测是否有needdrop字段
  355. if jso.get("need_drop", False):
  356. return jso
  357. # 开始正式逻辑
  358. s3_pdf_path = jso.get("file_location")
  359. s3_config = get_s3_config(s3_pdf_path)
  360. model_output_json_list = jso.get("doc_layout_result")
  361. data_source = get_data_source(jso)
  362. file_id = jso.get("file_id")
  363. book_name = f"{data_source}/{file_id}"
  364. # 1.23.22已修复
  365. # if debug_mode:
  366. # pass
  367. # else:
  368. # if book_name == "zlib/zlib_21929367":
  369. # jso['need_drop'] = True
  370. # jso['drop_reason'] = DropReason.SPECIAL_PDF
  371. # return jso
  372. junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]
  373. # total_page = jso['pdf_meta']['total_page']
  374. # 增加检测 max_svgs 数量的检测逻辑,如果 max_svgs 超过3000则drop
  375. svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
  376. max_svgs = max(svgs_per_page_list)
  377. if max_svgs > 3000:
  378. jso["need_drop"] = True
  379. jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
  380. # elif total_page > 1000:
  381. # jso['need_drop'] = True
  382. # jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
  383. else:
  384. try:
  385. save_path = s3_image_save_path
  386. image_s3_config = get_s3_config(save_path)
  387. start_time = time.time() # 记录开始时间
  388. # 先打印一下book_name和解析开始的时间
  389. logger.info(
  390. f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
  391. file=sys.stderr,
  392. )
  393. pdf_info_dict = parse_pdf_for_train(
  394. s3_pdf_path,
  395. s3_config,
  396. model_output_json_list,
  397. save_path,
  398. book_name,
  399. pdf_model_profile=None,
  400. image_s3_config=image_s3_config,
  401. start_page_id=start_page_id,
  402. junk_img_bojids=junk_img_bojids,
  403. debug_mode=debug_mode,
  404. )
  405. if pdf_info_dict.get(
  406. "need_drop", False
  407. ): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
  408. jso["need_drop"] = True
  409. jso["drop_reason"] = pdf_info_dict["drop_reason"]
  410. else: # 正常返回,将 pdf_info_dict 压缩并存储
  411. jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
  412. pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
  413. jso["pdf_intermediate_dict"] = pdf_info_dict
  414. end_time = time.time() # 记录完成时间
  415. parse_time = int(end_time - start_time) # 计算执行时间
  416. # 解析完成后打印一下book_name和耗时
  417. logger.info(
  418. f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
  419. file=sys.stderr,
  420. )
  421. jso["parse_time"] = parse_time
  422. except Exception as e:
  423. jso = exception_handler(jso, e)
  424. return jso
  425. # 专门用来跑被drop的pdf,跑完之后需要把need_drop字段置为false
  426. def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  427. if not jso.get("need_drop", False):
  428. return jso
  429. else:
  430. jso = ocr_parse_pdf_core(
  431. jso, start_page_id=start_page_id, debug_mode=debug_mode
  432. )
  433. jso["need_drop"] = False
  434. return jso
  435. def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  436. # 检测debug开关
  437. if debug_mode:
  438. pass
  439. else: # 如果debug没开,则检测是否有needdrop字段
  440. if jso.get("need_drop", False):
  441. return jso
  442. jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  443. return jso
  444. def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  445. s3_pdf_path = jso.get("file_location")
  446. s3_config = get_s3_config(s3_pdf_path)
  447. model_output_json_list = jso.get("doc_layout_result")
  448. data_source = get_data_source(jso)
  449. file_id = jso.get("file_id")
  450. book_name = f"{data_source}/{file_id}"
  451. try:
  452. save_path = s3_image_save_path
  453. image_s3_config = get_s3_config(save_path)
  454. start_time = time.time() # 记录开始时间
  455. # 先打印一下book_name和解析开始的时间
  456. logger.info(
  457. f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
  458. file=sys.stderr,
  459. )
  460. pdf_info_dict = parse_pdf_by_ocr(
  461. s3_pdf_path,
  462. s3_config,
  463. model_output_json_list,
  464. save_path,
  465. book_name,
  466. pdf_model_profile=None,
  467. image_s3_config=image_s3_config,
  468. start_page_id=start_page_id,
  469. debug_mode=debug_mode,
  470. )
  471. pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
  472. jso["pdf_intermediate_dict"] = pdf_info_dict
  473. end_time = time.time() # 记录完成时间
  474. parse_time = int(end_time - start_time) # 计算执行时间
  475. # 解析完成后打印一下book_name和耗时
  476. logger.info(
  477. f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
  478. file=sys.stderr,
  479. )
  480. jso["parse_time"] = parse_time
  481. except Exception as e:
  482. jso = exception_handler(jso, e)
  483. return jso
  484. def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
  485. if debug_mode:
  486. pass
  487. else: # 如果debug没开,则检测是否有needdrop字段
  488. if jso.get("need_drop", False):
  489. book_name = join_path(get_data_source(jso), jso["file_id"])
  490. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  491. jso["dropped"] = True
  492. return jso
  493. try:
  494. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  495. # 将 pdf_intermediate_dict 解压
  496. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  497. markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
  498. jso["content"] = markdown_content
  499. logger.info(
  500. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  501. file=sys.stderr,
  502. )
  503. # 把无用的信息清空
  504. jso["doc_layout_result"] = ""
  505. jso["pdf_intermediate_dict"] = ""
  506. jso["pdf_meta"] = ""
  507. except Exception as e:
  508. jso = exception_handler(jso, e)
  509. return jso
  510. def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
  511. if debug_mode:
  512. pass
  513. else: # 如果debug没开,则检测是否有needdrop字段
  514. if jso.get("need_drop", False):
  515. book_name = join_path(get_data_source(jso), jso["file_id"])
  516. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  517. jso["dropped"] = True
  518. return jso
  519. try:
  520. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  521. # 将 pdf_intermediate_dict 解压
  522. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  523. # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  524. markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
  525. jso["content"] = markdown_content
  526. logger.info(
  527. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  528. file=sys.stderr,
  529. )
  530. # 把无用的信息清空
  531. jso["doc_layout_result"] = ""
  532. jso["pdf_intermediate_dict"] = ""
  533. jso["pdf_meta"] = ""
  534. except Exception as e:
  535. jso = exception_handler(jso, e)
  536. return jso
  537. def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
  538. if debug_mode:
  539. pass
  540. else: # 如果debug没开,则检测是否有needdrop字段
  541. if jso.get("need_drop", False):
  542. book_name = join_path(get_data_source(jso), jso["file_id"])
  543. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  544. jso["dropped"] = True
  545. return jso
  546. try:
  547. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  548. # 将 pdf_intermediate_dict 解压
  549. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  550. markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
  551. jso["content"] = markdown_content
  552. logger.info(
  553. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  554. file=sys.stderr,
  555. )
  556. # 把无用的信息清空
  557. # jso["doc_layout_result"] = ""
  558. jso["pdf_intermediate_dict"] = ""
  559. # jso["pdf_meta"] = ""
  560. except Exception as e:
  561. jso = exception_handler(jso, e)
  562. return jso
  563. def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
  564. jso: dict, debug_mode=False
  565. ) -> dict:
  566. if debug_mode:
  567. pass
  568. else: # 如果debug没开,则检测是否有needdrop字段
  569. if jso.get("need_drop", False):
  570. book_name = join_path(get_data_source(jso), jso["file_id"])
  571. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  572. jso["dropped"] = True
  573. return jso
  574. try:
  575. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  576. # 将 pdf_intermediate_dict 解压
  577. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  578. markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  579. jso["content_ocr"] = markdown_content
  580. logger.info(
  581. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  582. file=sys.stderr,
  583. )
  584. # 把无用的信息清空
  585. jso["doc_layout_result"] = ""
  586. jso["pdf_intermediate_dict"] = ""
  587. jso["mid_json_ocr"] = pdf_intermediate_dict
  588. jso["pdf_meta"] = ""
  589. except Exception as e:
  590. jso = exception_handler(jso, e)
  591. return jso
  592. def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
  593. if debug_mode:
  594. pass
  595. else: # 如果debug没开,则检测是否有needdrop字段
  596. if jso.get("need_drop", False):
  597. book_name = join_path(get_data_source(jso), jso["file_id"])
  598. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  599. jso["dropped"] = True
  600. return jso
  601. try:
  602. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  603. # 将 pdf_intermediate_dict 解压
  604. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  605. standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
  606. jso["content_list"] = standard_format
  607. logger.info(
  608. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  609. file=sys.stderr,
  610. )
  611. # 把无用的信息清空
  612. jso["doc_layout_result"] = ""
  613. jso["pdf_intermediate_dict"] = ""
  614. jso["pdf_meta"] = ""
  615. except Exception as e:
  616. jso = exception_handler(jso, e)
  617. return jso
  618. def ocr_pdf_intermediate_dict_to_standard_format_with_para(jso: dict, debug_mode=False) -> dict:
  619. if debug_mode:
  620. pass
  621. else: # 如果debug没开,则检测是否有needdrop字段
  622. if jso.get("need_drop", False):
  623. book_name = join_path(get_data_source(jso), jso["file_id"])
  624. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  625. jso["dropped"] = True
  626. return jso
  627. try:
  628. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  629. # 将 pdf_intermediate_dict 解压
  630. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  631. standard_format = make_standard_format_with_para(pdf_intermediate_dict)
  632. jso["content_list"] = standard_format
  633. logger.info(
  634. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  635. file=sys.stderr,
  636. )
  637. # 把无用的信息清空
  638. jso["doc_layout_result"] = ""
  639. jso["pdf_intermediate_dict"] = ""
  640. jso["pdf_meta"] = ""
  641. except Exception as e:
  642. jso = exception_handler(jso, e)
  643. return jso
# Script entry point: nothing is executed when the module is run directly.
if __name__ == "__main__":
    pass