# pdf_parse_union_core_v2.py
  1. import copy
  2. import math
  3. import os
  4. import re
  5. import statistics
  6. import time
  7. import warnings
  8. from typing import List
  9. import cv2
  10. import fitz
  11. import torch
  12. import numpy as np
  13. from loguru import logger
  14. from tqdm import tqdm
  15. from magic_pdf.config.enums import SupportedPdfParseMethod
  16. from magic_pdf.config.ocr_content_type import BlockType, ContentType
  17. from magic_pdf.data.dataset import Dataset, PageableData
  18. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, __is_overlaps_y_exceeds_threshold
  19. from magic_pdf.libs.clean_memory import clean_memory
  20. from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device
  21. from magic_pdf.libs.convert_utils import dict_to_list
  22. from magic_pdf.libs.hash_utils import compute_md5
  23. from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
  24. from magic_pdf.model.magic_model import MagicModel
  25. from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
  26. from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
  27. from magic_pdf.post_proc.para_split_v3 import para_split
  28. from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
  29. from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
  30. from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
  31. from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
  32. from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
  33. remove_overlaps_min_spans, remove_x_overlapping_chars
  34. os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
  35. def __replace_STX_ETX(text_str: str):
  36. """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
  37. Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
  38. Args:
  39. text_str (str): raw text
  40. Returns:
  41. _type_: replaced text
  42. """ # noqa: E501
  43. if text_str:
  44. s = text_str.replace('\u0002', "'")
  45. s = s.replace('\u0003', "'")
  46. return s
  47. return text_str
  48. # 连写字符拆分
  49. def __replace_ligatures(text: str):
  50. ligatures = {
  51. 'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st'
  52. }
  53. return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
  54. def chars_to_content(span):
  55. # 检查span中的char是否为空
  56. if len(span['chars']) == 0:
  57. pass
  58. else:
  59. # 先给chars按char['bbox']的中心点的x坐标排序
  60. span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
  61. # Calculate the width of each character
  62. char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
  63. # Calculate the median width
  64. median_width = statistics.median(char_widths)
  65. # 通过x轴重叠比率移除一部分char
  66. span = remove_x_overlapping_chars(span, median_width)
  67. content = ''
  68. for char in span['chars']:
  69. # 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
  70. char1 = char
  71. char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
  72. if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
  73. content += f"{char['c']} "
  74. else:
  75. content += char['c']
  76. span['content'] = __replace_ligatures(content)
  77. del span['chars']
  78. LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
  79. LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
  80. def fill_char_in_spans(spans, all_chars):
  81. # 简单从上到下排一下序
  82. spans = sorted(spans, key=lambda x: x['bbox'][1])
  83. for char in all_chars:
  84. for span in spans:
  85. if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
  86. span['chars'].append(char)
  87. break
  88. need_ocr_spans = []
  89. for span in spans:
  90. chars_to_content(span)
  91. # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
  92. if len(span['content']) * span['height'] < span['width'] * 0.5:
  93. # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
  94. need_ocr_spans.append(span)
  95. del span['height'], span['width']
  96. return need_ocr_spans
  97. # 使用鲁棒性更强的中心点坐标判断
  98. def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
  99. char_center_x = (char_bbox[0] + char_bbox[2]) / 2
  100. char_center_y = (char_bbox[1] + char_bbox[3]) / 2
  101. span_center_y = (span_bbox[1] + span_bbox[3]) / 2
  102. span_height = span_bbox[3] - span_bbox[1]
  103. if (
  104. span_bbox[0] < char_center_x < span_bbox[2]
  105. and span_bbox[1] < char_center_y < span_bbox[3]
  106. and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
  107. ):
  108. return True
  109. else:
  110. # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
  111. # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
  112. if char in LINE_STOP_FLAG:
  113. if (
  114. (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
  115. and char_center_x > span_bbox[0]
  116. and span_bbox[1] < char_center_y < span_bbox[3]
  117. and abs(char_center_y - span_center_y) < span_height * span_height_radio
  118. ):
  119. return True
  120. elif char in LINE_START_FLAG:
  121. if (
  122. span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
  123. and char_center_x < span_bbox[2]
  124. and span_bbox[1] < char_center_y < span_bbox[3]
  125. and abs(char_center_y - span_center_y) < span_height * span_height_radio
  126. ):
  127. return True
  128. else:
  129. return False
  130. def remove_tilted_line(text_blocks):
  131. for block in text_blocks:
  132. remove_lines = []
  133. for line in block['lines']:
  134. cosine, sine = line['dir']
  135. # 计算弧度值
  136. angle_radians = math.atan2(sine, cosine)
  137. # 将弧度值转换为角度值
  138. angle_degrees = math.degrees(angle_radians)
  139. if 2 < abs(angle_degrees) < 88:
  140. remove_lines.append(line)
  141. for line in remove_lines:
  142. block['lines'].remove(line)
  143. def calculate_contrast(img, img_mode) -> float:
  144. """
  145. 计算给定图像的对比度。
  146. :param img: 图像,类型为numpy.ndarray
  147. :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr'
  148. :return: 图像的对比度值
  149. """
  150. if img_mode == 'rgb':
  151. # 将RGB图像转换为灰度图
  152. gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
  153. elif img_mode == 'bgr':
  154. # 将BGR图像转换为灰度图
  155. gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  156. else:
  157. raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
  158. # 计算均值和标准差
  159. mean_value = np.mean(gray_img)
  160. std_dev = np.std(gray_img)
  161. # 对比度定义为标准差除以平均值(加上小常数避免除零错误)
  162. contrast = std_dev / (mean_value + 1e-6)
  163. # logger.debug(f"contrast: {contrast}")
  164. return round(contrast, 2)
  165. # @measure_time
  166. def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
  167. # cid用0xfffd表示,连字符拆开
  168. # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
  169. # cid用0xfffd表示,连字符不拆开
  170. #text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
  171. # 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用
  172. text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
  173. # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
  174. # 移除所有角度不为0或90的line
  175. remove_tilted_line(text_blocks_raw)
  176. all_pymu_chars = []
  177. for block in text_blocks_raw:
  178. for line in block['lines']:
  179. cosine, sine = line['dir']
  180. if abs(cosine) < 0.9 or abs(sine) > 0.1:
  181. continue
  182. for span in line['spans']:
  183. all_pymu_chars.extend(span['chars'])
  184. # 计算所有sapn的高度的中位数
  185. span_height_list = []
  186. for span in spans:
  187. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
  188. continue
  189. span_height = span['bbox'][3] - span['bbox'][1]
  190. span['height'] = span_height
  191. span['width'] = span['bbox'][2] - span['bbox'][0]
  192. span_height_list.append(span_height)
  193. if len(span_height_list) == 0:
  194. return spans
  195. else:
  196. median_span_height = statistics.median(span_height_list)
  197. useful_spans = []
  198. unuseful_spans = []
  199. # 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
  200. vertical_spans = []
  201. for span in spans:
  202. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
  203. continue
  204. for block in all_bboxes + all_discarded_blocks:
  205. if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
  206. continue
  207. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
  208. if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
  209. vertical_spans.append(span)
  210. elif block in all_bboxes:
  211. useful_spans.append(span)
  212. else:
  213. unuseful_spans.append(span)
  214. break
  215. """垂直的span框直接用pymu的line进行填充"""
  216. if len(vertical_spans) > 0:
  217. text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
  218. all_pymu_lines = []
  219. for block in text_blocks:
  220. for line in block['lines']:
  221. all_pymu_lines.append(line)
  222. for pymu_line in all_pymu_lines:
  223. for span in vertical_spans:
  224. if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
  225. for pymu_span in pymu_line['spans']:
  226. span['content'] += pymu_span['text']
  227. break
  228. for span in vertical_spans:
  229. if len(span['content']) == 0:
  230. spans.remove(span)
  231. """水平的span框如果没有char则用ocr进行填充"""
  232. new_spans = []
  233. for span in useful_spans + unuseful_spans:
  234. if span['type'] in [ContentType.Text]:
  235. span['chars'] = []
  236. new_spans.append(span)
  237. need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
  238. if len(need_ocr_spans) > 0:
  239. # 初始化ocr模型
  240. # atom_model_manager = AtomModelSingleton()
  241. # ocr_model = atom_model_manager.get_atom_model(
  242. # atom_model_name='ocr',
  243. # ocr_show_log=False,
  244. # det_db_box_thresh=0.3,
  245. # lang=lang
  246. # )
  247. for span in need_ocr_spans:
  248. # 对span的bbox截图再ocr
  249. span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
  250. # 计算span的对比度,低于0.20的span不进行ocr
  251. if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
  252. spans.remove(span)
  253. continue
  254. # pass
  255. span['content'] = ''
  256. span['score'] = 1
  257. span['np_img'] = span_img
  258. # ocr_res = ocr_model.ocr(span_img, det=False)
  259. # if ocr_res and len(ocr_res) > 0:
  260. # if len(ocr_res[0]) > 0:
  261. # ocr_text, ocr_score = ocr_res[0][0]
  262. # # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
  263. # if ocr_score > 0.5 and len(ocr_text) > 0:
  264. # span['content'] = ocr_text
  265. # span['score'] = float(round(ocr_score, 2))
  266. # else:
  267. # spans.remove(span)
  268. return spans
  269. def model_init(model_name: str):
  270. from transformers import LayoutLMv3ForTokenClassification
  271. device_name = get_device()
  272. bf_16_support = False
  273. if device_name.startswith("cuda"):
  274. bf_16_support = torch.cuda.is_bf16_supported()
  275. elif device_name.startswith("mps"):
  276. bf_16_support = True
  277. device = torch.device(device_name)
  278. if model_name == 'layoutreader':
  279. # 检测modelscope的缓存目录是否存在
  280. layoutreader_model_dir = get_local_layoutreader_model_dir()
  281. if os.path.exists(layoutreader_model_dir):
  282. model = LayoutLMv3ForTokenClassification.from_pretrained(
  283. layoutreader_model_dir
  284. )
  285. else:
  286. logger.warning(
  287. 'local layoutreader model not exists, use online model from huggingface'
  288. )
  289. model = LayoutLMv3ForTokenClassification.from_pretrained(
  290. 'hantian/layoutreader'
  291. )
  292. if bf_16_support:
  293. model.to(device).eval().bfloat16()
  294. else:
  295. model.to(device).eval()
  296. else:
  297. logger.error('model name not allow')
  298. exit(1)
  299. return model
  300. class ModelSingleton:
  301. _instance = None
  302. _models = {}
  303. def __new__(cls, *args, **kwargs):
  304. if cls._instance is None:
  305. cls._instance = super().__new__(cls)
  306. return cls._instance
  307. def get_model(self, model_name: str):
  308. if model_name not in self._models:
  309. self._models[model_name] = model_init(model_name=model_name)
  310. return self._models[model_name]
  311. def do_predict(boxes: List[List[int]], model) -> List[int]:
  312. from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
  313. boxes2inputs, parse_logits, prepare_inputs)
  314. with warnings.catch_warnings():
  315. warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
  316. inputs = boxes2inputs(boxes)
  317. inputs = prepare_inputs(inputs, model)
  318. logits = model(**inputs).logits.cpu().squeeze(0)
  319. return parse_logits(logits, len(boxes))
  320. def cal_block_index(fix_blocks, sorted_bboxes):
  321. if sorted_bboxes is not None:
  322. # 使用layoutreader排序
  323. for block in fix_blocks:
  324. line_index_list = []
  325. if len(block['lines']) == 0:
  326. block['index'] = sorted_bboxes.index(block['bbox'])
  327. else:
  328. for line in block['lines']:
  329. line['index'] = sorted_bboxes.index(line['bbox'])
  330. line_index_list.append(line['index'])
  331. median_value = statistics.median(line_index_list)
  332. block['index'] = median_value
  333. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  334. if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
  335. if 'real_lines' in block:
  336. block['virtual_lines'] = copy.deepcopy(block['lines'])
  337. block['lines'] = copy.deepcopy(block['real_lines'])
  338. del block['real_lines']
  339. else:
  340. # 使用xycut排序
  341. block_bboxes = []
  342. for block in fix_blocks:
  343. # 如果block['bbox']任意值小于0,将其置为0
  344. block['bbox'] = [max(0, x) for x in block['bbox']]
  345. block_bboxes.append(block['bbox'])
  346. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  347. if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
  348. if 'real_lines' in block:
  349. block['virtual_lines'] = copy.deepcopy(block['lines'])
  350. block['lines'] = copy.deepcopy(block['real_lines'])
  351. del block['real_lines']
  352. import numpy as np
  353. from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
  354. recursive_xy_cut
  355. random_boxes = np.array(block_bboxes)
  356. np.random.shuffle(random_boxes)
  357. res = []
  358. recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
  359. assert len(res) == len(block_bboxes)
  360. sorted_boxes = random_boxes[np.array(res)].tolist()
  361. for i, block in enumerate(fix_blocks):
  362. block['index'] = sorted_boxes.index(block['bbox'])
  363. # 生成line index
  364. sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
  365. line_inedx = 1
  366. for block in sorted_blocks:
  367. for line in block['lines']:
  368. line['index'] = line_inedx
  369. line_inedx += 1
  370. return fix_blocks
  371. def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
  372. # block_bbox是一个元组(x0, y0, x1, y1),其中(x0, y0)是左下角坐标,(x1, y1)是右上角坐标
  373. x0, y0, x1, y1 = block_bbox
  374. block_height = y1 - y0
  375. block_weight = x1 - x0
  376. # 如果block高度小于n行正文,则直接返回block的bbox
  377. if line_height * 2 < block_height:
  378. if (
  379. block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
  380. ): # 可能是双列结构,可以切细点
  381. lines = int(block_height / line_height)
  382. else:
  383. # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
  384. if block_weight > page_w * 0.4:
  385. lines = 3
  386. elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
  387. lines = int(block_height / line_height)
  388. else: # 判断长宽比
  389. if block_height / block_weight > 1.2: # 细长的不分
  390. return [[x0, y0, x1, y1]]
  391. else: # 不细长的还是分成两行
  392. lines = 2
  393. line_height = (y1 - y0) / lines
  394. # 确定从哪个y位置开始绘制线条
  395. current_y = y0
  396. # 用于存储线条的位置信息[(x0, y), ...]
  397. lines_positions = []
  398. for i in range(lines):
  399. lines_positions.append([x0, current_y, x1, current_y + line_height])
  400. current_y += line_height
  401. return lines_positions
  402. else:
  403. return [[x0, y0, x1, y1]]
  404. def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
  405. page_line_list = []
  406. def add_lines_to_block(b):
  407. line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h)
  408. b['lines'] = []
  409. for line_bbox in line_bboxes:
  410. b['lines'].append({'bbox': line_bbox, 'spans': []})
  411. page_line_list.extend(line_bboxes)
  412. for block in fix_blocks:
  413. if block['type'] in [
  414. BlockType.Text, BlockType.Title,
  415. BlockType.ImageCaption, BlockType.ImageFootnote,
  416. BlockType.TableCaption, BlockType.TableFootnote
  417. ]:
  418. if len(block['lines']) == 0:
  419. add_lines_to_block(block)
  420. elif block['type'] in [BlockType.Title] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2:
  421. block['real_lines'] = copy.deepcopy(block['lines'])
  422. add_lines_to_block(block)
  423. else:
  424. for line in block['lines']:
  425. bbox = line['bbox']
  426. page_line_list.append(bbox)
  427. elif block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
  428. block['real_lines'] = copy.deepcopy(block['lines'])
  429. add_lines_to_block(block)
  430. if len(page_line_list) > 200: # layoutreader最高支持512line
  431. return None
  432. # 使用layoutreader排序
  433. x_scale = 1000.0 / page_w
  434. y_scale = 1000.0 / page_h
  435. boxes = []
  436. # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
  437. for left, top, right, bottom in page_line_list:
  438. if left < 0:
  439. logger.warning(
  440. f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  441. ) # noqa: E501
  442. left = 0
  443. if right > page_w:
  444. logger.warning(
  445. f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  446. ) # noqa: E501
  447. right = page_w
  448. if top < 0:
  449. logger.warning(
  450. f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  451. ) # noqa: E501
  452. top = 0
  453. if bottom > page_h:
  454. logger.warning(
  455. f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  456. ) # noqa: E501
  457. bottom = page_h
  458. left = round(left * x_scale)
  459. top = round(top * y_scale)
  460. right = round(right * x_scale)
  461. bottom = round(bottom * y_scale)
  462. assert (
  463. 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
  464. ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121
  465. boxes.append([left, top, right, bottom])
  466. model_manager = ModelSingleton()
  467. model = model_manager.get_model('layoutreader')
  468. with torch.no_grad():
  469. orders = do_predict(boxes, model)
  470. sorted_bboxes = [page_line_list[i] for i in orders]
  471. return sorted_bboxes
  472. def get_line_height(blocks):
  473. page_line_height_list = []
  474. for block in blocks:
  475. if block['type'] in [
  476. BlockType.Text, BlockType.Title,
  477. BlockType.ImageCaption, BlockType.ImageFootnote,
  478. BlockType.TableCaption, BlockType.TableFootnote
  479. ]:
  480. for line in block['lines']:
  481. bbox = line['bbox']
  482. page_line_height_list.append(int(bbox[3] - bbox[1]))
  483. if len(page_line_height_list) > 0:
  484. return statistics.median(page_line_height_list)
  485. else:
  486. return 10
  487. def process_groups(groups, body_key, caption_key, footnote_key):
  488. body_blocks = []
  489. caption_blocks = []
  490. footnote_blocks = []
  491. for i, group in enumerate(groups):
  492. group[body_key]['group_id'] = i
  493. body_blocks.append(group[body_key])
  494. for caption_block in group[caption_key]:
  495. caption_block['group_id'] = i
  496. caption_blocks.append(caption_block)
  497. for footnote_block in group[footnote_key]:
  498. footnote_block['group_id'] = i
  499. footnote_blocks.append(footnote_block)
  500. return body_blocks, caption_blocks, footnote_blocks
  501. def process_block_list(blocks, body_type, block_type):
  502. indices = [block['index'] for block in blocks]
  503. median_index = statistics.median(indices)
  504. body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
  505. return {
  506. 'type': block_type,
  507. 'bbox': body_bbox,
  508. 'blocks': blocks,
  509. 'index': median_index,
  510. }
  511. def revert_group_blocks(blocks):
  512. image_groups = {}
  513. table_groups = {}
  514. new_blocks = []
  515. for block in blocks:
  516. if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
  517. group_id = block['group_id']
  518. if group_id not in image_groups:
  519. image_groups[group_id] = []
  520. image_groups[group_id].append(block)
  521. elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
  522. group_id = block['group_id']
  523. if group_id not in table_groups:
  524. table_groups[group_id] = []
  525. table_groups[group_id].append(block)
  526. else:
  527. new_blocks.append(block)
  528. for group_id, blocks in image_groups.items():
  529. new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))
  530. for group_id, blocks in table_groups.items():
  531. new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))
  532. return new_blocks
  533. def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
  534. def get_block_bboxes(blocks, block_type_list):
  535. return [block[0:4] for block in blocks if block[7] in block_type_list]
  536. image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
  537. table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
  538. other_block_type = []
  539. for block_type in BlockType.__dict__.values():
  540. if not isinstance(block_type, str):
  541. continue
  542. if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
  543. other_block_type.append(block_type)
  544. other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
  545. discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])
  546. new_spans = []
  547. for span in spans:
  548. span_bbox = span['bbox']
  549. span_type = span['type']
  550. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
  551. discarded_block_bboxes):
  552. new_spans.append(span)
  553. continue
  554. if span_type == ContentType.Image:
  555. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  556. image_bboxes):
  557. new_spans.append(span)
  558. elif span_type == ContentType.Table:
  559. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  560. table_bboxes):
  561. new_spans.append(span)
  562. else:
  563. if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
  564. other_block_bboxes):
  565. new_spans.append(span)
  566. return new_spans
  567. def parse_page_core(
  568. page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
  569. ):
  570. need_drop = False
  571. drop_reason = []
  572. """从magic_model对象中获取后面会用到的区块信息"""
  573. img_groups = magic_model.get_imgs_v2(page_id)
  574. table_groups = magic_model.get_tables_v2(page_id)
  575. """对image和table的区块分组"""
  576. img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
  577. img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
  578. )
  579. table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
  580. table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
  581. )
  582. discarded_blocks = magic_model.get_discarded(page_id)
  583. text_blocks = magic_model.get_text_blocks(page_id)
  584. title_blocks = magic_model.get_title_blocks(page_id)
  585. inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
  586. page_w, page_h = magic_model.get_page_size(page_id)
  587. def merge_title_blocks(blocks, x_distance_threshold=0.1*page_w):
  588. def merge_two_bbox(b1, b2):
  589. x_min = min(b1['bbox'][0], b2['bbox'][0])
  590. y_min = min(b1['bbox'][1], b2['bbox'][1])
  591. x_max = max(b1['bbox'][2], b2['bbox'][2])
  592. y_max = max(b1['bbox'][3], b2['bbox'][3])
  593. return x_min, y_min, x_max, y_max
  594. def merge_two_blocks(b1, b2):
  595. # 合并两个标题块的边界框
  596. b1['bbox'] = merge_two_bbox(b1, b2)
  597. # 合并两个标题块的文本内容
  598. line1 = b1['lines'][0]
  599. line2 = b2['lines'][0]
  600. line1['bbox'] = merge_two_bbox(line1, line2)
  601. line1['spans'].extend(line2['spans'])
  602. return b1, b2
  603. # 按 y 轴重叠度聚集标题块
  604. y_overlapping_blocks = []
  605. title_bs = [b for b in blocks if b['type'] == BlockType.Title]
  606. while title_bs:
  607. block1 = title_bs.pop(0)
  608. current_row = [block1]
  609. to_remove = []
  610. for block2 in title_bs:
  611. if (
  612. __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9)
  613. and len(block1['lines']) == 1
  614. and len(block2['lines']) == 1
  615. ):
  616. current_row.append(block2)
  617. to_remove.append(block2)
  618. for b in to_remove:
  619. title_bs.remove(b)
  620. y_overlapping_blocks.append(current_row)
  621. # 按x轴坐标排序并合并标题块
  622. to_remove_blocks = []
  623. for row in y_overlapping_blocks:
  624. if len(row) == 1:
  625. continue
  626. # 按x轴坐标排序
  627. row.sort(key=lambda x: x['bbox'][0])
  628. merged_block = row[0]
  629. for i in range(1, len(row)):
  630. left_block = merged_block
  631. right_block = row[i]
  632. left_height = left_block['bbox'][3] - left_block['bbox'][1]
  633. right_height = right_block['bbox'][3] - right_block['bbox'][1]
  634. if (
  635. right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold
  636. and left_height * 0.95 < right_height < left_height * 1.05
  637. ):
  638. merged_block, to_remove_block = merge_two_blocks(merged_block, right_block)
  639. to_remove_blocks.append(to_remove_block)
  640. else:
  641. merged_block = right_block
  642. for b in to_remove_blocks:
  643. blocks.remove(b)
  644. """将所有区块的bbox整理到一起"""
  645. # interline_equation_blocks参数不够准,后面切换到interline_equations上
  646. interline_equation_blocks = []
  647. if len(interline_equation_blocks) > 0:
  648. all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
  649. img_body_blocks, img_caption_blocks, img_footnote_blocks,
  650. table_body_blocks, table_caption_blocks, table_footnote_blocks,
  651. discarded_blocks,
  652. text_blocks,
  653. title_blocks,
  654. interline_equation_blocks,
  655. page_w,
  656. page_h,
  657. )
  658. else:
  659. all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
  660. img_body_blocks, img_caption_blocks, img_footnote_blocks,
  661. table_body_blocks, table_caption_blocks, table_footnote_blocks,
  662. discarded_blocks,
  663. text_blocks,
  664. title_blocks,
  665. interline_equations,
  666. page_w,
  667. page_h,
  668. )
  669. """获取所有的spans信息"""
  670. spans = magic_model.get_all_spans(page_id)
  671. """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
  672. """顺便删除大水印并保留abandon的span"""
  673. spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
  674. """删除重叠spans中置信度较低的那些"""
  675. spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
  676. """删除重叠spans中较小的那些"""
  677. spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
  678. """根据parse_mode,构造spans,主要是文本类的字符填充"""
  679. if parse_mode == SupportedPdfParseMethod.TXT:
  680. """使用新版本的混合ocr方案."""
  681. spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
  682. elif parse_mode == SupportedPdfParseMethod.OCR:
  683. pass
  684. else:
  685. raise Exception('parse_mode must be txt or ocr')
  686. """先处理不需要排版的discarded_blocks"""
  687. discarded_block_with_spans, spans = fill_spans_in_blocks(
  688. all_discarded_blocks, spans, 0.4
  689. )
  690. fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
  691. """如果当前页面没有有效的bbox则跳过"""
  692. if len(all_bboxes) == 0:
  693. logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
  694. return ocr_construct_page_component_v2(
  695. [],
  696. [],
  697. page_id,
  698. page_w,
  699. page_h,
  700. [],
  701. [],
  702. [],
  703. interline_equations,
  704. fix_discarded_blocks,
  705. need_drop,
  706. drop_reason,
  707. )
  708. """对image和table截图"""
  709. spans = ocr_cut_image_and_table(
  710. spans, page_doc, page_id, pdf_bytes_md5, imageWriter
  711. )
  712. """span填充进block"""
  713. block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
  714. """对block进行fix操作"""
  715. fix_blocks = fix_block_spans_v2(block_with_spans)
  716. """同一行被断开的titile合并"""
  717. merge_title_blocks(fix_blocks)
  718. """获取所有line并计算正文line的高度"""
  719. line_height = get_line_height(fix_blocks)
  720. """获取所有line并对line排序"""
  721. sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
  722. """根据line的中位数算block的序列关系"""
  723. fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
  724. """将image和table的block还原回group形式参与后续流程"""
  725. fix_blocks = revert_group_blocks(fix_blocks)
  726. """重排block"""
  727. sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
  728. """block内重排(img和table的block内多个caption或footnote的排序)"""
  729. for block in sorted_blocks:
  730. if block['type'] in [BlockType.Image, BlockType.Table]:
  731. block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
  732. """获取QA需要外置的list"""
  733. images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)
  734. """构造pdf_info_dict"""
  735. page_info = ocr_construct_page_component_v2(
  736. sorted_blocks,
  737. [],
  738. page_id,
  739. page_w,
  740. page_h,
  741. [],
  742. images,
  743. tables,
  744. interline_equations,
  745. fix_discarded_blocks,
  746. need_drop,
  747. drop_reason,
  748. )
  749. return page_info
  def pdf_parse_union(
      model_list,
      dataset: Dataset,
      imageWriter,
      parse_mode,
      start_page_id=0,
      end_page_id=None,
      debug_mode=False,
      lang=None,
  ):
      """Parse a PDF dataset page by page and post-process the result.

      Pages whose index lies in ``[start_page_id, end_page_id]`` are parsed
      with ``parse_page_core``; every other page gets an empty page component
      built with ``True, 'skip page'`` as its trailing arguments. Afterwards
      every span that still carries an ``np_img`` crop is recognised by the
      OCR model in one batch, ``para_split`` is applied, and the LLM-aided
      stages that are enabled in the configuration are run.

      Args:
          model_list: Passed to ``MagicModel`` together with ``dataset``.
          dataset: Dataset to parse. It is iterated page by page and must
              support ``len()`` and ``data_bits()``.
          imageWriter: Forwarded unchanged to ``parse_page_core``.
          parse_mode: Forwarded unchanged to ``parse_page_core``.
          start_page_id: Index of the first page to parse.
          end_page_id: Index of the last page to parse (inclusive). ``None``
              or a negative value selects the last page; a value past the
              end is clamped to the last page and a warning is logged.
          debug_mode: Not used by this function; kept so existing callers
              that pass it keep working.
          lang: Forwarded to ``parse_page_core`` and to the OCR model lookup.

      Returns:
          dict: ``{'pdf_info': pdf_info_list}`` where ``pdf_info_list`` is
          ``dict_to_list`` applied to the per-page results.

      Raises:
          AssertionError: If the OCR model returns a different number of
              results than the number of crops submitted.
      """
      pdf_bytes_md5 = compute_md5(dataset.data_bits())

      # Per-page results, keyed by 'page_<page_id>'.
      pdf_info_dict = {}

      # Initialise the MagicModel from model_list and the dataset.
      magic_model = MagicModel(model_list, dataset)

      # Resolve the inclusive page range that should actually be parsed.
      last_page_id = len(dataset) - 1
      if end_page_id is None or end_page_id < 0:
          end_page_id = last_page_id
      if end_page_id > last_page_id:
          logger.warning('end_page_id is out of range, use pdf_docs length')
          end_page_id = last_page_id

      # Parse every page; pages outside the range still get an (empty) entry.
      for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
          if start_page_id <= page_id <= end_page_id:
              page_info = parse_page_core(
                  page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
              )
          else:
              skipped_page = page.get_page_info()
              page_info = ocr_construct_page_component_v2(
                  [], [], page_id, skipped_page.w, skipped_page.h,
                  [], [], [], [], [], True, 'skip page'
              )
          pdf_info_dict[f'page_{page_id}'] = page_info

      # Gather every block that may hold text lines: text/title blocks,
      # captions and footnotes nested in image/table blocks, and discarded blocks.
      nested_text_types = ('image_caption', 'image_footnote', 'table_caption', 'table_footnote')
      text_block_list = []
      for page_info in pdf_info_dict.values():
          for block in page_info['preproc_blocks']:
              if block['type'] in ('table', 'image'):
                  text_block_list.extend(
                      sub_block
                      for sub_block in block['blocks']
                      if sub_block['type'] in nested_text_types
                  )
              elif block['type'] in ('text', 'title'):
                  text_block_list.append(block)
          text_block_list.extend(page_info['discarded_blocks'])

      # Detach the pending image crops from their spans so they can be
      # recognised in a single batch; the two lists stay index-aligned.
      need_ocr_list = []
      img_crop_list = []
      for block in text_block_list:
          for line in block['lines']:
              for span in line['spans']:
                  if 'np_img' in span:
                      need_ocr_list.append(span)
                      img_crop_list.append(span.pop('np_img'))

      if img_crop_list:
          atom_model_manager = AtomModelSingleton()
          ocr_model = atom_model_manager.get_atom_model(
              atom_model_name='ocr',
              ocr_show_log=False,
              det_db_box_thresh=0.3,
              lang=lang
          )
          ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]

          # Raised explicitly (instead of via `assert`) so the check still runs
          # under `python -O`; the exception type and message are unchanged.
          if len(ocr_res_list) != len(need_ocr_list):
              raise AssertionError(
                  f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
              )

          # Write the recognised text and its rounded score back onto each span.
          for span, (ocr_text, ocr_score) in zip(need_ocr_list, ocr_res_list):
              span['content'] = ocr_text
              span['score'] = float(round(ocr_score, 2))

      # Paragraph splitting.
      para_split(pdf_info_dict)

      # Optional LLM-aided refinement, run in the order formula -> text -> title.
      llm_aided_config = get_llm_aided_config()
      if llm_aided_config is not None:
          llm_stages = (
              ('formula_aided', 'formula', llm_aided_formula),
              ('text_aided', 'text', llm_aided_text),
              ('title_aided', 'title', llm_aided_title),
          )
          for config_key, label, run_stage in llm_stages:
              stage_config = llm_aided_config.get(config_key, None)
              if stage_config is not None and stage_config.get('enable', False):
                  stage_start_time = time.time()
                  run_stage(pdf_info_dict, stage_config)
                  logger.info(f'llm aided {label} time: {round(time.time() - stage_start_time, 2)}')

      # Convert the per-page dict into the list form that is returned.
      pdf_info_list = dict_to_list(pdf_info_dict)
      new_pdf_info_dict = {
          'pdf_info': pdf_info_list,
      }

      clean_memory(get_device())

      return new_pdf_info_dict
  # Script entry point: running this module directly is a no-op.
  if __name__ == '__main__':
      pass