equations_replace.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. """
  2. 对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
  3. """
  4. from magic_pdf.libs.commons import fitz
  5. import json
  6. import os
  7. from pathlib import Path
  8. from loguru import logger
  9. from magic_pdf.libs.ocr_content_type import ContentType
# Tags written into the "_type" and "font" fields of the synthetic spans this
# module creates for inline and interline equations.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
  12. def combine_chars_to_pymudict(block_dict, char_dict):
  13. """
  14. 把block级别的pymupdf 结构里加入char结构
  15. """
  16. # 因为block_dict 被裁剪过,因此先把他和char_dict文字块对齐,才能进行补充
  17. char_map = {tuple(item['bbox']):item for item in char_dict}
  18. for i in range(len(block_dict)): # blcok
  19. block = block_dict[i]
  20. key = block['bbox']
  21. char_dict_item = char_map[tuple(key)]
  22. char_dict_map = {tuple(item['bbox']):item for item in char_dict_item['lines']}
  23. for j in range(len(block['lines'])):
  24. lines = block['lines'][j]
  25. with_char_lines = char_dict_map[lines['bbox']]
  26. for k in range(len(lines['spans'])):
  27. spans = lines['spans'][k]
  28. try:
  29. chars = with_char_lines['spans'][k]['chars']
  30. except Exception as e:
  31. logger.error(char_dict[i]['lines'][j])
  32. spans['chars'] = chars
  33. return block_dict
  34. def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
  35. """
  36. 计算box1和box2的重叠面积占最小面积的box的比例
  37. """
  38. # Determine the coordinates of the intersection rectangle
  39. x_left = max(bbox1[0], min_bbox[0])
  40. y_top = max(bbox1[1], min_bbox[1])
  41. x_right = min(bbox1[2], min_bbox[2])
  42. y_bottom = min(bbox1[3], min_bbox[3])
  43. if x_right < x_left or y_bottom < y_top:
  44. return 0.0
  45. # The area of overlap area
  46. intersection_area = (x_right - x_left) * (y_bottom - y_top)
  47. min_box_area = (min_bbox[3]-min_bbox[1])*(min_bbox[2]-min_bbox[0])
  48. if min_box_area==0:
  49. return 0
  50. else:
  51. return intersection_area / min_box_area
  52. def _is_xin(bbox1, bbox2):
  53. area1 = abs(bbox1[2]-bbox1[0])*abs(bbox1[3]-bbox1[1])
  54. area2 = abs(bbox2[2]-bbox2[0])*abs(bbox2[3]-bbox2[1])
  55. if area1<area2:
  56. ratio = calculate_overlap_area_2_minbox_area_ratio(bbox2, bbox1)
  57. else:
  58. ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
  59. return ratio>0.6
  60. def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
  61. """消除掉整个块都在行间公式块内部的文本块"""
  62. for eq_bbox in interline_bboxes:
  63. removed_txt_blk = []
  64. for text_blk in text_blocks:
  65. text_bbox = text_blk['bbox']
  66. if calculate_overlap_area_2_minbox_area_ratio(eq_bbox['bbox'], text_bbox)>=0.7:
  67. removed_txt_blk.append(text_blk)
  68. for blk in removed_txt_blk:
  69. text_blocks.remove(blk)
  70. return text_blocks
  71. def _is_in_or_part_overlap(box1, box2) -> bool:
  72. """
  73. 两个bbox是否有部分重叠或者包含
  74. """
  75. if box1 is None or box2 is None:
  76. return False
  77. x0_1, y0_1, x1_1, y1_1 = box1
  78. x0_2, y0_2, x1_2, y1_2 = box2
  79. return not (x1_1 < x0_2 or # box1在box2的左边
  80. x0_1 > x1_2 or # box1在box2的右边
  81. y1_1 < y0_2 or # box1在box2的上边
  82. y0_1 > y1_2) # box1在box2的下边
  83. def remove_text_block_overlap_interline_equation_bbox(interline_eq_bboxes, pymu_block_list):
  84. """消除掉行行内公式有部分重叠的文本块的内容。
  85. 同时重新计算消除重叠之后文本块的大小"""
  86. deleted_block = []
  87. for text_block in pymu_block_list:
  88. deleted_line = []
  89. for line in text_block['lines']:
  90. deleted_span = []
  91. for span in line['spans']:
  92. deleted_chars = []
  93. for char in span['chars']:
  94. if any([_is_in_or_part_overlap(char['bbox'], eq_bbox['bbox']) for eq_bbox in interline_eq_bboxes]):
  95. deleted_chars.append(char)
  96. # 检查span里没有char则删除这个span
  97. for char in deleted_chars:
  98. span['chars'].remove(char)
  99. # 重新计算这个span的大小
  100. if len(span['chars'])==0: # 删除这个span
  101. deleted_span.append(span)
  102. else:
  103. span['bbox'] = min([b['bbox'][0] for b in span['chars']]),min([b['bbox'][1] for b in span['chars']]),max([b['bbox'][2] for b in span['chars']]), max([b['bbox'][3] for b in span['chars']])
  104. # 检查这个span
  105. for span in deleted_span:
  106. line['spans'].remove(span)
  107. if len(line['spans'])==0: #删除这个line
  108. deleted_line.append(line)
  109. else:
  110. line['bbox'] = min([b['bbox'][0] for b in line['spans']]),min([b['bbox'][1] for b in line['spans']]),max([b['bbox'][2] for b in line['spans']]), max([b['bbox'][3] for b in line['spans']])
  111. # 检查这个block是否可以删除
  112. for line in deleted_line:
  113. text_block['lines'].remove(line)
  114. if len(text_block['lines'])==0: # 删除block
  115. deleted_block.append(text_block)
  116. else:
  117. text_block['bbox'] = min([b['bbox'][0] for b in text_block['lines']]),min([b['bbox'][1] for b in text_block['lines']]),max([b['bbox'][2] for b in text_block['lines']]), max([b['bbox'][3] for b in text_block['lines']])
  118. # 检查text block删除
  119. for block in deleted_block:
  120. pymu_block_list.remove(block)
  121. if len(pymu_block_list)==0:
  122. return []
  123. return pymu_block_list
  124. def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
  125. """在行间公式对应的地方插上一个伪造的block"""
  126. for eq in interline_eq_bboxes:
  127. bbox = eq['bbox']
  128. latex_content = eq['latex_text']
  129. text_block = {
  130. "number": len(pymu_block_list),
  131. "type": 0,
  132. "bbox": bbox,
  133. "lines": [
  134. {
  135. "spans": [
  136. {
  137. "size": 9.962599754333496,
  138. "_type": TYPE_INTERLINE_EQUATION,
  139. "flags": 4,
  140. "font": TYPE_INTERLINE_EQUATION,
  141. "color": 0,
  142. "ascender": 0.9409999847412109,
  143. "descender": -0.3050000071525574,
  144. "text": f"\n$$\n{latex_content}\n$$\n",
  145. "origin": [
  146. bbox[0],
  147. bbox[1]
  148. ],
  149. "bbox": bbox
  150. }
  151. ],
  152. "wmode": 0,
  153. "dir": [
  154. 1.0,
  155. 0.0
  156. ],
  157. "bbox": bbox
  158. }
  159. ]
  160. }
  161. pymu_block_list.append(text_block)
  162. def x_overlap_ratio(box1, box2):
  163. a, _, c, _ = box1
  164. e, _, g, _ = box2
  165. # 计算重叠宽度
  166. overlap_x = max(min(c, g) - max(a, e), 0)
  167. # 计算box1的宽度
  168. width1 = g - e
  169. # 计算重叠比例
  170. overlap_ratio = overlap_x / width1 if width1 != 0 else 0
  171. return overlap_ratio
  172. def __is_x_dir_overlap(bbox1, bbox2):
  173. return not (bbox1[2]<bbox2[0] or bbox1[0]>bbox2[2])
  174. def __y_overlap_ratio(box1, box2):
  175. """"""
  176. _, b, _, d = box1
  177. _, f, _, h = box2
  178. # 计算重叠高度
  179. overlap_y = max(min(d, h) - max(b, f), 0)
  180. # 计算box1的高度
  181. height1 = d - b
  182. # 计算重叠比例
  183. overlap_ratio = overlap_y / height1 if height1 != 0 else 0
  184. return overlap_ratio
  185. def replace_line_v2(eqinfo, line):
  186. """
  187. 扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
  188. 最后与这个x0,x1有相交的span0, span1内部进行分割。
  189. """
  190. first_overlap_span = -1
  191. first_overlap_span_idx = -1
  192. last_overlap_span = -1
  193. delete_chars = []
  194. for i in range(0, len(line['spans'])):
  195. if line['spans'][i].get("_type", None) is not None:
  196. continue # 忽略,因为已经是插入的伪造span公式了
  197. for char in line['spans'][i]['chars']:
  198. if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
  199. line_txt = ""
  200. for span in line['spans']:
  201. span_txt = "<span>"
  202. for ch in span['chars']:
  203. span_txt = span_txt + ch['c']
  204. span_txt = span_txt + "</span>"
  205. line_txt = line_txt + span_txt
  206. if first_overlap_span_idx == -1:
  207. first_overlap_span = line['spans'][i]
  208. first_overlap_span_idx = i
  209. last_overlap_span = line['spans'][i]
  210. delete_chars.append(char)
  211. # 第一个和最后一个char要进行检查,到底属于公式多还是属于正常span多
  212. if len(delete_chars)>0:
  213. ch0_bbox = delete_chars[0]['bbox']
  214. if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
  215. delete_chars.remove(delete_chars[0])
  216. if len(delete_chars)>0:
  217. ch0_bbox = delete_chars[-1]['bbox']
  218. if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
  219. delete_chars.remove(delete_chars[-1])
  220. # 计算x方向上被删除区间内的char的真实x0, x1
  221. if len(delete_chars):
  222. x0, x1 = min([b['bbox'][0] for b in delete_chars]), max([b['bbox'][2] for b in delete_chars])
  223. else:
  224. logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}")
  225. return False
  226. # 删除位于x0, x1这两个中间的span
  227. delete_span = []
  228. for span in line['spans']:
  229. span_box = span['bbox']
  230. if x0<=span_box[0] and span_box[2]<=x1:
  231. delete_span.append(span)
  232. for span in delete_span:
  233. line['spans'].remove(span)
  234. equation_span = {
  235. "size": 9.962599754333496,
  236. "_type": TYPE_INLINE_EQUATION,
  237. "flags": 4,
  238. "font": TYPE_INLINE_EQUATION,
  239. "color": 0,
  240. "ascender": 0.9409999847412109,
  241. "descender": -0.3050000071525574,
  242. "text": "",
  243. "origin": [
  244. 337.1410153102337,
  245. 216.0205245153934
  246. ],
  247. "bbox": [
  248. 337.1410153102337,
  249. 216.0205245153934,
  250. 390.4496373892022,
  251. 228.50171037628277
  252. ]
  253. }
  254. #equation_span = line['spans'][0].copy()
  255. equation_span['text'] = f" ${eqinfo['latex_text']}$ "
  256. equation_span['bbox'] = [x0, equation_span['bbox'][1], x1, equation_span['bbox'][3]]
  257. equation_span['origin'] = [equation_span['bbox'][0], equation_span['bbox'][1]]
  258. equation_span['chars'] = delete_chars
  259. equation_span['_type'] = TYPE_INLINE_EQUATION
  260. equation_span['_eq_bbox'] = eqinfo['bbox']
  261. line['spans'].insert(first_overlap_span_idx+1, equation_span) # 放入公式
  262. # logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】")
  263. # 第一个、和最后一个有overlap的span进行分割,然后插入对应的位置
  264. first_span_chars = [char for char in first_overlap_span['chars'] if (char['bbox'][2]+char['bbox'][0])/2<x0]
  265. tail_span_chars = [char for char in last_overlap_span['chars'] if (char['bbox'][0]+char['bbox'][2])/2>x1]
  266. if len(first_span_chars)>0:
  267. first_overlap_span['chars'] = first_span_chars
  268. first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
  269. first_overlap_span['bbox'] = (first_overlap_span['bbox'][0], first_overlap_span['bbox'][1], max([chr['bbox'][2] for chr in first_span_chars]), first_overlap_span['bbox'][3])
  270. # first_overlap_span['_type'] = "first"
  271. else:
  272. # 删掉
  273. if first_overlap_span not in delete_span:
  274. line['spans'].remove(first_overlap_span)
  275. if len(tail_span_chars)>0:
  276. if last_overlap_span==first_overlap_span: # 这个时候应该插入一个新的
  277. tail_span_txt = ''.join([char['c'] for char in tail_span_chars])
  278. last_span_to_insert = last_overlap_span.copy()
  279. last_span_to_insert['chars'] = tail_span_chars
  280. last_span_to_insert['text'] = ''.join([char['c'] for char in tail_span_chars])
  281. last_span_to_insert['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
  282. # 插入到公式对象之后
  283. equation_idx = line['spans'].index(equation_span)
  284. line['spans'].insert(equation_idx+1, last_span_to_insert) # 放入公式
  285. else: # 直接修改原来的span
  286. last_overlap_span['chars'] = tail_span_chars
  287. last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
  288. last_overlap_span['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
  289. else:
  290. # 删掉
  291. if last_overlap_span not in delete_span and last_overlap_span!=first_overlap_span:
  292. line['spans'].remove(last_overlap_span)
  293. remain_txt = ""
  294. for span in line['spans']:
  295. span_txt = "<span>"
  296. for char in span['chars']:
  297. span_txt = span_txt + char['c']
  298. span_txt = span_txt + "</span>"
  299. remain_txt = remain_txt + span_txt
  300. # logger.info(f"<== succ replace, text is 【{remain_txt}】, equation is 【{eqinfo['latex_text']}】")
  301. return True
  302. def replace_eq_blk(eqinfo, text_block):
  303. """替换行内公式"""
  304. for line in text_block['lines']:
  305. line_bbox = line['bbox']
  306. if _is_xin(eqinfo['bbox'], line_bbox) or __y_overlap_ratio(eqinfo['bbox'], line_bbox)>0.6: # 定位到行, 使用y方向重合率是因为有的时候,一个行的宽度会小于公式位置宽度:行很高,公式很窄,
  307. replace_succ = replace_line_v2(eqinfo, line)
  308. if not replace_succ: # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行
  309. continue
  310. else:
  311. break
  312. else:
  313. return False
  314. return True
  315. def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
  316. """替换行内公式"""
  317. for eqinfo in inline_equation_bboxes:
  318. eqbox = eqinfo['bbox']
  319. for blk in raw_text_blocks:
  320. if _is_xin(eqbox, blk['bbox']):
  321. if not replace_eq_blk(eqinfo, blk):
  322. logger.error(f"行内公式没有替换成功:{eqinfo} ")
  323. else:
  324. break
  325. return raw_text_blocks
  326. def remove_chars_in_text_blocks(text_blocks):
  327. """删除text_blocks里的char"""
  328. for blk in text_blocks:
  329. for line in blk['lines']:
  330. for span in line['spans']:
  331. _ = span.pop("chars", "no such key")
  332. return text_blocks
  333. def replace_equations_in_textblock(raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes):
  334. """
  335. 替换行间和和行内公式为latex
  336. """
  337. raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # 消除重叠:第一步,在公式内部的
  338. raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # 消重,第二步,和公式覆盖的
  339. insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
  340. raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
  341. return raw_text_blocks
  342. def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
  343. """
  344. """
  345. new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
  346. with open(json_path, "r", encoding='utf-8') as f:
  347. obj = json.loads(f.read())
  348. if os.path.exists(new_pdf):
  349. os.remove(new_pdf)
  350. new_doc = fitz.open('')
  351. doc = fitz.open(pdf_path)
  352. new_doc = fitz.open(pdf_path)
  353. for i in range(len(new_doc)):
  354. page = new_doc[i]
  355. inline_equation_bboxes = obj[f"page_{i}"]['inline_equations']
  356. interline_equation_bboxes = obj[f"page_{i}"]['interline_equations']
  357. raw_text_blocks = obj[f'page_{i}']['preproc_blocks']
  358. raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # 消除重叠:第一步,在公式内部的
  359. raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # 消重,第二步,和公式覆盖的
  360. insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
  361. raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
  362. # 为了检验公式是否重复,把每一行里,含有公式的span背景改成黄色的
  363. color_map = [fitz.pdfcolor['blue'],fitz.pdfcolor['green']]
  364. j = 0
  365. for blk in raw_text_blocks:
  366. for i,line in enumerate(blk['lines']):
  367. # line_box = line['bbox']
  368. # shape = page.new_shape()
  369. # shape.draw_rect(line_box)
  370. # shape.finish(color=fitz.pdfcolor['red'], fill=color_map[j%2], fill_opacity=0.3)
  371. # shape.commit()
  372. # j = j+1
  373. for i, span in enumerate(line['spans']):
  374. shape_page = page.new_shape()
  375. span_type = span.get('_type')
  376. color = fitz.pdfcolor['blue']
  377. if span_type=='first':
  378. color = fitz.pdfcolor['blue']
  379. elif span_type=='tail':
  380. color = fitz.pdfcolor['green']
  381. elif span_type==TYPE_INLINE_EQUATION:
  382. color = fitz.pdfcolor['black']
  383. else:
  384. color = None
  385. b = span['bbox']
  386. shape_page.draw_rect(b)
  387. shape_page.finish(color=None, fill=color, fill_opacity=0.3)
  388. shape_page.commit()
  389. new_doc.save(new_pdf)
  390. logger.info(f"save ok {new_pdf}")
  391. final_json = json.dumps(obj, ensure_ascii=False,indent=2)
  392. with open("equations_test/final_json.json", "w") as f:
  393. f.write(final_json)
  394. return new_pdf
# Script entry point: intentionally a no-op; the debug drawing call below is
# kept only as a usage example.
if __name__=="__main__":
    # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
    pass