utils.py 90 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
# Names exported by a wildcard import (`from <this module> import *`).
__all__ = [
    "get_sub_regions_ocr_res",
    "get_layout_ordering",
    "get_single_block_parsing_res",
    "get_show_color",
    "sorted_layout_boxes",
]
  21. import numpy as np
  22. from PIL import Image
  23. import uuid
  24. import re
  25. from pathlib import Path
  26. from copy import deepcopy
  27. from typing import Optional, Union, List, Tuple, Dict, Any
  28. from ..ocr.result import OCRResult
  29. from ...models.object_detection.result import DetResult
  30. from ..components import convert_points_to_boxes
  31. def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
  32. """
  33. Get the indices of source boxes that overlap with reference boxes based on a specified threshold.
  34. Args:
  35. src_boxes (np.ndarray): A 2D numpy array of source bounding boxes.
  36. ref_boxes (np.ndarray): A 2D numpy array of reference bounding boxes.
  37. Returns:
  38. match_idx_list (list): A list of indices of source boxes that overlap with reference boxes.
  39. """
  40. match_idx_list = []
  41. src_boxes_num = len(src_boxes)
  42. if src_boxes_num > 0 and len(ref_boxes) > 0:
  43. for rno in range(len(ref_boxes)):
  44. ref_box = ref_boxes[rno]
  45. x1 = np.maximum(ref_box[0], src_boxes[:, 0])
  46. y1 = np.maximum(ref_box[1], src_boxes[:, 1])
  47. x2 = np.minimum(ref_box[2], src_boxes[:, 2])
  48. y2 = np.minimum(ref_box[3], src_boxes[:, 3])
  49. pub_w = x2 - x1
  50. pub_h = y2 - y1
  51. match_idx = np.where((pub_w > 3) & (pub_h > 3))[0]
  52. match_idx_list.extend(match_idx)
  53. return match_idx_list
  54. def get_sub_regions_ocr_res(
  55. overall_ocr_res: OCRResult,
  56. object_boxes: List,
  57. flag_within: bool = True,
  58. return_match_idx: bool = False,
  59. ) -> OCRResult:
  60. """
  61. Filters OCR results to only include text boxes within specified object boxes based on a flag.
  62. Args:
  63. overall_ocr_res (OCRResult): The original OCR result containing all text boxes.
  64. object_boxes (list): A list of bounding boxes for the objects of interest.
  65. flag_within (bool): If True, only include text boxes within the object boxes. If False, exclude text boxes within the object boxes.
  66. return_match_idx (bool): If True, return the list of matching indices.
  67. Returns:
  68. OCRResult: A filtered OCR result containing only the relevant text boxes.
  69. """
  70. sub_regions_ocr_res = {}
  71. sub_regions_ocr_res["rec_polys"] = []
  72. sub_regions_ocr_res["rec_texts"] = []
  73. sub_regions_ocr_res["rec_scores"] = []
  74. sub_regions_ocr_res["rec_boxes"] = []
  75. overall_text_boxes = overall_ocr_res["rec_boxes"]
  76. match_idx_list = get_overlap_boxes_idx(overall_text_boxes, object_boxes)
  77. match_idx_list = list(set(match_idx_list))
  78. for box_no in range(len(overall_text_boxes)):
  79. if flag_within:
  80. if box_no in match_idx_list:
  81. flag_match = True
  82. else:
  83. flag_match = False
  84. else:
  85. if box_no not in match_idx_list:
  86. flag_match = True
  87. else:
  88. flag_match = False
  89. if flag_match:
  90. sub_regions_ocr_res["rec_polys"].append(
  91. overall_ocr_res["rec_polys"][box_no]
  92. )
  93. sub_regions_ocr_res["rec_texts"].append(
  94. overall_ocr_res["rec_texts"][box_no]
  95. )
  96. sub_regions_ocr_res["rec_scores"].append(
  97. overall_ocr_res["rec_scores"][box_no]
  98. )
  99. sub_regions_ocr_res["rec_boxes"].append(
  100. overall_ocr_res["rec_boxes"][box_no]
  101. )
  102. for key in ["rec_polys", "rec_scores", "rec_boxes"]:
  103. sub_regions_ocr_res[key] = np.array(sub_regions_ocr_res[key])
  104. return (
  105. (sub_regions_ocr_res, match_idx_list)
  106. if return_match_idx
  107. else sub_regions_ocr_res
  108. )
  109. def sorted_layout_boxes(res, w):
  110. """
  111. Sort text boxes in order from top to bottom, left to right
  112. Args:
  113. res: List of dictionaries containing layout information.
  114. w: Width of image.
  115. Returns:
  116. List of dictionaries containing sorted layout information.
  117. """
  118. num_boxes = len(res)
  119. if num_boxes == 1:
  120. return res
  121. # Sort on the y axis first or sort it on the x axis
  122. sorted_boxes = sorted(res, key=lambda x: (x["block_bbox"][1], x["block_bbox"][0]))
  123. _boxes = list(sorted_boxes)
  124. new_res = []
  125. res_left = []
  126. res_right = []
  127. i = 0
  128. while True:
  129. if i >= num_boxes:
  130. break
  131. # Check that the bbox is on the left
  132. elif (
  133. _boxes[i]["block_bbox"][0] < w / 4
  134. and _boxes[i]["block_bbox"][2] < 3 * w / 5
  135. ):
  136. res_left.append(_boxes[i])
  137. i += 1
  138. elif _boxes[i]["block_bbox"][0] > 2 * w / 5:
  139. res_right.append(_boxes[i])
  140. i += 1
  141. else:
  142. new_res += res_left
  143. new_res += res_right
  144. new_res.append(_boxes[i])
  145. res_left = []
  146. res_right = []
  147. i += 1
  148. res_left = sorted(res_left, key=lambda x: (x["block_bbox"][1]))
  149. res_right = sorted(res_right, key=lambda x: (x["block_bbox"][1]))
  150. if res_left:
  151. new_res += res_left
  152. if res_right:
  153. new_res += res_right
  154. return new_res
  155. def _calculate_overlap_area_div_minbox_area_ratio(
  156. bbox1: Union[list, tuple],
  157. bbox2: Union[list, tuple],
  158. ) -> float:
  159. """
  160. Calculate the ratio of the overlap area between bbox1 and bbox2
  161. to the area of the smaller bounding box.
  162. Args:
  163. bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
  164. bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
  165. Returns:
  166. float: The ratio of the overlap area to the area of the smaller bounding box.
  167. """
  168. bbox1 = list(map(int, bbox1))
  169. bbox2 = list(map(int, bbox2))
  170. x_left = max(bbox1[0], bbox2[0])
  171. y_top = max(bbox1[1], bbox2[1])
  172. x_right = min(bbox1[2], bbox2[2])
  173. y_bottom = min(bbox1[3], bbox2[3])
  174. if x_right <= x_left or y_bottom <= y_top:
  175. return 0.0
  176. intersection_area = (x_right - x_left) * (y_bottom - y_top)
  177. area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
  178. area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
  179. min_box_area = min(area_bbox1, area_bbox2)
  180. if min_box_area <= 0:
  181. return 0.0
  182. return intersection_area / min_box_area
  183. def _whether_y_overlap_exceeds_threshold(
  184. bbox1: Union[list, tuple],
  185. bbox2: Union[list, tuple],
  186. overlap_ratio_threshold: float = 0.6,
  187. ) -> bool:
  188. """
  189. Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
  190. Args:
  191. bbox1 (list or tuple): The first bounding box defined as (left, top, right, bottom).
  192. bbox2 (list or tuple): The second bounding box defined as (left, top, right, bottom).
  193. overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
  194. Defaults to 0.6.
  195. Returns:
  196. bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
  197. exceeds the overlap_ratio_threshold, otherwise False.
  198. """
  199. _, y1_0, _, y1_1 = bbox1
  200. _, y2_0, _, y2_1 = bbox2
  201. overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
  202. min_height = min(y1_1 - y1_0, y2_1 - y2_0)
  203. return (overlap / min_height) > overlap_ratio_threshold
  204. def _adjust_span_text(span: List[str], prepend: bool = False, append: bool = False):
  205. """
  206. Adjust the text of a span by prepending or appending a newline.
  207. Args:
  208. span (list): A list where the second element is the text of the span.
  209. prepend (bool): If True, prepend a newline to the text.
  210. append (bool): If True, append a newline to the text.
  211. Returns:
  212. None: The function modifies the span in place.
  213. """
  214. if prepend:
  215. span[1] = "\n" + span[1]
  216. if append:
  217. span[1] = span[1] + "\n"
  218. return span
  219. def _format_line(
  220. line: List[List[Union[List[int], str]]],
  221. layout_min: int,
  222. layout_max: int,
  223. is_reference: bool = False,
  224. ) -> None:
  225. """
  226. Format a line of text spans based on layout constraints.
  227. Args:
  228. line (list): A list of spans, where each span is a list containing a bounding box and text.
  229. layout_min (int): The minimum x-coordinate of the layout bounding box.
  230. layout_max (int): The maximum x-coordinate of the layout bounding box.
  231. is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.
  232. Returns:
  233. None: The function modifies the line in place.
  234. """
  235. first_span = line[0]
  236. end_span = line[-1]
  237. if not is_reference:
  238. if first_span[0][0] - layout_min > 10:
  239. first_span = _adjust_span_text(first_span, prepend=True)
  240. if layout_max - end_span[0][2] > 10:
  241. end_span = _adjust_span_text(end_span, append=True)
  242. else:
  243. if first_span[0][0] - layout_min < 5:
  244. first_span = _adjust_span_text(first_span, prepend=True)
  245. if layout_max - end_span[0][2] > 20:
  246. end_span = _adjust_span_text(end_span, append=True)
  247. line[0] = first_span
  248. line[-1] = end_span
  249. return line
  250. def split_boxes_if_x_contained(boxes, offset=1e-5):
  251. """
  252. Check if there is any complete containment in the x-direction
  253. between the bounding boxes and split the containing box accordingly.
  254. Args:
  255. boxes (list of lists): Each element is a list containing an ndarray of length 4, a description, and a label.
  256. offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
  257. Returns:
  258. A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
  259. """
  260. def is_x_contained(box_a, box_b):
  261. """Check if box_a completely contains box_b in the x-direction."""
  262. return box_a[0][0] <= box_b[0][0] and box_a[0][2] >= box_b[0][2]
  263. new_boxes = []
  264. for i in range(len(boxes)):
  265. box_a = boxes[i]
  266. is_split = False
  267. for j in range(len(boxes)):
  268. if i == j:
  269. continue
  270. box_b = boxes[j]
  271. if is_x_contained(box_a, box_b):
  272. is_split = True
  273. # Split box_a based on the x-coordinates of box_b
  274. if box_a[0][0] < box_b[0][0]:
  275. w = box_b[0][0] - offset - box_a[0][0]
  276. if w > 1:
  277. new_boxes.append(
  278. [
  279. np.array(
  280. [
  281. box_a[0][0],
  282. box_a[0][1],
  283. box_b[0][0] - offset,
  284. box_a[0][3],
  285. ]
  286. ),
  287. box_a[1],
  288. box_a[2],
  289. ]
  290. )
  291. if box_a[0][2] > box_b[0][2]:
  292. w = box_a[0][2] - box_b[0][2] + offset
  293. if w > 1:
  294. box_a = [
  295. np.array(
  296. [
  297. box_b[0][2] + offset,
  298. box_a[0][1],
  299. box_a[0][2],
  300. box_a[0][3],
  301. ]
  302. ),
  303. box_a[1],
  304. box_a[2],
  305. ]
  306. if j == len(boxes) - 1 and is_split:
  307. new_boxes.append(box_a)
  308. if not is_split:
  309. new_boxes.append(box_a)
  310. return new_boxes
  311. def _sort_line_by_x_projection(
  312. input_img: np.ndarray,
  313. general_ocr_pipeline: Any,
  314. line: List[List[Union[List[int], str]]],
  315. ) -> None:
  316. """
  317. Sort a line of text spans based on their vertical position within the layout bounding box.
  318. Args:
  319. input_img (ndarray): The input image used for OCR.
  320. general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
  321. line (list): A list of spans, where each span is a list containing a bounding box and text.
  322. Returns:
  323. list: The sorted line of text spans.
  324. """
  325. splited_boxes = split_boxes_if_x_contained(line)
  326. splited_lines = []
  327. if len(line) != len(splited_boxes):
  328. splited_boxes.sort(key=lambda span: span[0][0])
  329. text_rec_model = general_ocr_pipeline.text_rec_model
  330. for span in splited_boxes:
  331. if span[2] == "text":
  332. crop_img = input_img[
  333. int(span[0][1]) : int(span[0][3]),
  334. int(span[0][0]) : int(span[0][2]),
  335. ]
  336. span[1] = next(text_rec_model([crop_img]))["rec_text"]
  337. splited_lines.append(span)
  338. else:
  339. splited_lines = line
  340. return splited_lines
  341. def _sort_ocr_res_by_y_projection(
  342. input_img: np.ndarray,
  343. general_ocr_pipeline: Any,
  344. label: Any,
  345. block_bbox: Tuple[int, int, int, int],
  346. ocr_res: Dict[str, List[Any]],
  347. line_height_iou_threshold: float = 0.7,
  348. ) -> Dict[str, List[Any]]:
  349. """
  350. Sorts OCR results based on their spatial arrangement, grouping them into lines and blocks.
  351. Args:
  352. input_img (ndarray): The input image used for OCR.
  353. general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
  354. label (Any): The label associated with the OCR results. It's not used in the function but might be
  355. relevant for other parts of the calling context.
  356. block_bbox (Tuple[int, int, int, int]): A tuple representing the layout bounding box, defined as
  357. (left, top, right, bottom).
  358. ocr_res (Dict[str, List[Any]]): A dictionary containing OCR results with the following keys:
  359. - "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
  360. - "rec_texts": A corresponding list of recognized text strings for each box.
  361. line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
  362. the same line based on their vertical overlap. Defaults to 0.7.
  363. Returns:
  364. Dict[str, List[Any]]: A dictionary with the same structure as `ocr_res`, but with boxes and texts sorted
  365. and grouped into lines and blocks.
  366. """
  367. assert (
  368. ocr_res["boxes"] and ocr_res["rec_texts"]
  369. ), "OCR results must contain 'boxes' and 'rec_texts'"
  370. boxes = ocr_res["boxes"]
  371. rec_texts = ocr_res["rec_texts"]
  372. rec_labels = ocr_res["rec_labels"]
  373. x_min, _, x_max, _ = block_bbox
  374. inline_x_min = min([box[0] for box in boxes])
  375. inline_x_max = max([box[2] for box in boxes])
  376. spans = list(zip(boxes, rec_texts, rec_labels))
  377. spans.sort(key=lambda span: span[0][1])
  378. spans = [list(span) for span in spans]
  379. lines = []
  380. current_line = [spans[0]]
  381. current_y0, current_y1 = spans[0][0][1], spans[0][0][3]
  382. for span in spans[1:]:
  383. y0, y1 = span[0][1], span[0][3]
  384. if _whether_y_overlap_exceeds_threshold(
  385. (0, current_y0, 0, current_y1),
  386. (0, y0, 0, y1),
  387. line_height_iou_threshold,
  388. ):
  389. current_line.append(span)
  390. current_y0 = min(current_y0, y0)
  391. current_y1 = max(current_y1, y1)
  392. else:
  393. lines.append(current_line)
  394. current_line = [span]
  395. current_y0, current_y1 = y0, y1
  396. if current_line:
  397. lines.append(current_line)
  398. new_lines = []
  399. for line in lines:
  400. line.sort(key=lambda span: span[0][0])
  401. ocr_labels = [span[2] for span in line]
  402. if "formula" in ocr_labels:
  403. line = _sort_line_by_x_projection(input_img, general_ocr_pipeline, line)
  404. if label == "reference":
  405. line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
  406. elif label != "content":
  407. line = _format_line(line, x_min, x_max)
  408. new_lines.append(line)
  409. ocr_res["boxes"] = [span[0] for line in new_lines for span in line]
  410. if label == "content":
  411. ocr_res["rec_texts"] = [
  412. "".join(f"{span[1]} " for span in line).rstrip() for line in new_lines
  413. ]
  414. else:
  415. ocr_res["rec_texts"] = [span[1] + " " for line in new_lines for span in line]
  416. return ocr_res, len(new_lines)
  417. def _process_text(input_text: str) -> str:
  418. """
  419. Process the input text to handle spaces.
  420. The function removes multiple consecutive spaces between Chinese characters and ensures that
  421. only a single space is retained between Chinese and non-Chinese characters.
  422. Args:
  423. input_text (str): The text to be processed.
  424. Returns:
  425. str: The processed text with properly formatted spaces.
  426. """
  427. def handle_spaces_(text: str) -> str:
  428. """
  429. Handle spaces in the text by removing multiple consecutive spaces and inserting a single space
  430. between Chinese and non-Chinese characters.
  431. Args:
  432. text (str): The text to handle spaces for.
  433. Returns:
  434. str: The text with properly formatted spaces.
  435. """
  436. spaces = re.finditer(r"\s+", text)
  437. processed_text = list(text)
  438. for space in reversed(list(spaces)):
  439. start, end = space.span()
  440. prev_char = processed_text[start - 1] if start > 0 else ""
  441. next_char = processed_text[end] if end < len(processed_text) else ""
  442. is_prev_chinese = (
  443. re.match(r"[\u4e00-\u9fff]", prev_char) if prev_char else False
  444. )
  445. is_next_chinese = (
  446. re.match(r"[\u4e00-\u9fff]", next_char) if next_char else False
  447. )
  448. if is_prev_chinese and is_next_chinese:
  449. processed_text[start:end] = []
  450. else:
  451. processed_text[start:end] = [" "]
  452. return "".join(processed_text)
  453. text_without_spaces = handle_spaces_(input_text)
  454. final_text = re.sub(r"\s+", " ", text_without_spaces).strip()
  455. return final_text
def get_single_block_parsing_res(
    general_ocr_pipeline: Any,
    overall_ocr_res: OCRResult,
    layout_det_res: DetResult,
    table_res_list: list,
    seal_res_list: list,
) -> OCRResult:
    """
    Build one parsed record per layout block from OCR, layout, table and seal results.

    Args:
        general_ocr_pipeline (Any): The general OCR pipeline, passed through to the
            line-sorting helper for text recognition.
        overall_ocr_res (OCRResult): The overall OCR results. The keys read here are:
            - "doc_preprocessor_res"["output_img"]: the image that crops are taken from.
            - "rec_boxes": the text boxes, each indexable as [x_min, y_min, x_max, y_max].
            - "rec_texts": the recognized text for each box.
            - "rec_labels": the label for each box.
        layout_det_res (DetResult): The layout detection results. "boxes" is deep-copied
            and filtered; each remaining item is read through its "coordinate" and
            "label" keys.
        table_res_list (list): Table results; each item provides "cell_box_list" and
            "pred_html".
        seal_res_list (List): Seal results; each item provides "rec_texts".

    Returns:
        The value returned by `get_layout_ordering` for the list of block records. Each
        record built here is a dictionary with "block_label", "block_content" and
        "block_bbox". Chart/image records add "block_image"; other text-like records add
        "seg_start_coordinate", "seg_end_coordinate", "num_of_lines" and "block_area".
    """
    single_block_layout_parsing_res = []
    input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]
    seal_index = 0
    with_doc_title = False
    max_block_area = 0.0
    paragraph_title_indexs = []

    # `_remove_overlap_blocks` is defined outside this excerpt; presumably it drops
    # overlapping layout boxes - TODO confirm the meaning of `threshold`/`smaller`.
    layout_det_res_list, _ = _remove_overlap_blocks(
        deepcopy(layout_det_res["boxes"]),
        threshold=0.5,
        smaller=True,
    )

    for box_idx, box_info in enumerate(layout_det_res_list):
        block_bbox = box_info["coordinate"]
        label = box_info["label"]
        rec_res = {"boxes": [], "rec_texts": [], "rec_labels": [], "flag": False}
        # Defaults used when no OCR text falls inside this block.
        seg_start_coordinate = float("inf")
        seg_end_coordinate = float("-inf")
        num_of_lines = 1

        if label == "doc_title":
            with_doc_title = True
        elif label == "paragraph_title":
            paragraph_title_indexs.append(box_idx)

        block_area = (block_bbox[2] - block_bbox[0]) * (block_bbox[3] - block_bbox[1])
        max_block_area = max(max_block_area, block_area)

        if label == "table":
            # Attach the first table result whose first cell box overlaps this block.
            for table_res in table_res_list:
                if len(table_res["cell_box_list"]) == 0:
                    continue
                if (
                    _calculate_overlap_area_div_minbox_area_ratio(
                        block_bbox, table_res["cell_box_list"][0]
                    )
                    > 0.5
                ):
                    single_block_layout_parsing_res.append(
                        {
                            "block_label": label,
                            "block_content": table_res["pred_html"],
                            "block_bbox": block_bbox,
                        },
                    )
                    break
        elif label == "seal":
            # Seal results are consumed in order, one per seal block.
            # NOTE(review): `seal_index` is not bounds-checked against
            # `seal_res_list`; more seal blocks than seal results would raise
            # IndexError - confirm the caller guarantees matching counts.
            if len(seal_res_list) > 0:
                single_block_layout_parsing_res.append(
                    {
                        "block_label": label,
                        "block_content": _process_text(
                            ", ".join(seal_res_list[seal_index]["rec_texts"])
                        ),
                        "block_bbox": block_bbox,
                    },
                )
                seal_index += 1
        else:
            # Collect every OCR text box that mostly lies inside this block.
            overall_text_boxes = overall_ocr_res["rec_boxes"]
            for box_no in range(len(overall_text_boxes)):
                if (
                    _calculate_overlap_area_div_minbox_area_ratio(
                        block_bbox, overall_text_boxes[box_no]
                    )
                    > 0.5
                ):
                    rec_res["boxes"].append(overall_text_boxes[box_no])
                    rec_res["rec_texts"].append(
                        overall_ocr_res["rec_texts"][box_no],
                    )
                    rec_res["rec_labels"].append(
                        overall_ocr_res["rec_labels"][box_no],
                    )
                    rec_res["flag"] = True

            if rec_res["flag"]:
                rec_res, num_of_lines = _sort_ocr_res_by_y_projection(
                    input_img, general_ocr_pipeline, label, block_bbox, rec_res, 0.7
                )
                # Left edge of the first ordered box and right edge of the last one.
                seg_start_coordinate = rec_res["boxes"][0][0]
                seg_end_coordinate = rec_res["boxes"][-1][2]
                if label == "formula":
                    rec_res["rec_texts"] = [
                        rec_res_text.replace("$", "")
                        for rec_res_text in rec_res["rec_texts"]
                    ]

            if label in ["chart", "image"]:
                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
                img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
                # `::-1` reverses the last axis of the crop; presumably BGR -> RGB
                # for PIL - TODO confirm the channel order of `input_img`.
                img = Image.fromarray(input_img[y_min:y_max, x_min:x_max, ::-1])
                single_block_layout_parsing_res.append(
                    {
                        "block_label": label,
                        "block_content": _process_text("".join(rec_res["rec_texts"])),
                        "block_image": {img_path: img},
                        "block_bbox": block_bbox,
                    },
                )
            else:
                if label in ["doc_title"]:
                    content = " ".join(rec_res["rec_texts"])
                elif label in ["content"]:
                    content = "\n".join(rec_res["rec_texts"])
                else:
                    content = "".join(rec_res["rec_texts"])
                    # Reference text keeps its raw spacing and is not normalized.
                    if label != "reference":
                        content = _process_text(content)
                single_block_layout_parsing_res.append(
                    {
                        "block_label": label,
                        "block_content": content,
                        "block_bbox": block_bbox,
                        "seg_start_coordinate": seg_start_coordinate,
                        "seg_end_coordinate": seg_end_coordinate,
                        "num_of_lines": num_of_lines,
                        "block_area": block_area,
                    },
                )

    # With no doc_title and exactly one paragraph_title, promote that title to
    # doc_title when its area exceeds 30% of the largest block area.
    # NOTE(review): `paragraph_title_indexs` stores positions in
    # `layout_det_res_list`, but it is used to index `single_block_layout_parsing_res`,
    # which gets no entry for an unmatched table or for a seal when `seal_res_list`
    # is empty. The two can drift apart (wrong block relabelled, or IndexError) -
    # confirm.
    if (
        not with_doc_title
        and len(paragraph_title_indexs) == 1
        and single_block_layout_parsing_res[paragraph_title_indexs[0]].get(
            "block_area", 0
        )
        > max_block_area * 0.3
    ):
        single_block_layout_parsing_res[paragraph_title_indexs[0]][
            "block_label"
        ] = "doc_title"

    # No layout blocks at all: fall back to one "text" record per OCR box.
    if len(layout_det_res_list) == 0:
        for ocr_rec_box, ocr_rec_text in zip(
            overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"]
        ):
            single_block_layout_parsing_res.append(
                {
                    "block_label": "text",
                    "block_content": ocr_rec_text,
                    "block_bbox": ocr_rec_box,
                    "seg_start_coordinate": ocr_rec_box[0],
                    "seg_end_coordinate": ocr_rec_box[2],
                },
            )

    # `get_layout_ordering` is defined outside this excerpt.
    single_block_layout_parsing_res = get_layout_ordering(
        single_block_layout_parsing_res,
        no_mask_labels=[
            "text",
            "formula",
            "algorithm",
            "reference",
            "content",
            "abstract",
        ],
    )

    return single_block_layout_parsing_res
  632. def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
  633. """
  634. Generate a 1D projection histogram from bounding boxes along a specified axis.
  635. Args:
  636. boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
  637. axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
  638. Returns:
  639. A 1D numpy array representing the projection histogram based on bounding box intervals.
  640. """
  641. assert axis in [0, 1]
  642. max_length = np.max(boxes[:, axis::2])
  643. projection = np.zeros(max_length, dtype=int)
  644. # Increment projection histogram over the interval defined by each bounding box
  645. for start, end in boxes[:, axis::2]:
  646. projection[start:end] += 1
  647. return projection
  648. def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
  649. """
  650. Split the projection profile into segments based on specified thresholds.
  651. Args:
  652. arr_values: 1D array representing the projection profile.
  653. min_value: Minimum value threshold to consider a profile segment significant.
  654. min_gap: Minimum gap width to consider a separation between segments.
  655. Returns:
  656. A tuple of start and end indices for each segment that meets the criteria.
  657. """
  658. # Identify indices where the projection exceeds the minimum value
  659. significant_indices = np.where(arr_values > min_value)[0]
  660. if not len(significant_indices):
  661. return
  662. # Calculate gaps between significant indices
  663. index_diffs = significant_indices[1:] - significant_indices[:-1]
  664. gap_indices = np.where(index_diffs > min_gap)[0]
  665. # Determine start and end indices of segments
  666. segment_starts = np.insert(
  667. significant_indices[gap_indices + 1],
  668. 0,
  669. significant_indices[0],
  670. )
  671. segment_ends = np.append(
  672. significant_indices[gap_indices],
  673. significant_indices[-1] + 1,
  674. )
  675. return segment_starts, segment_ends
  676. def _recursive_yx_cut(
  677. boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
  678. ):
  679. """
  680. Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
  681. Args:
  682. boxes: A (N, 4) array representing bounding boxes.
  683. indices: List of indices indicating the original position of boxes.
  684. res: List to store indices of the final segmented bounding boxes.
  685. min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
  686. Returns:
  687. None: This function modifies the `res` list in place.
  688. """
  689. assert len(boxes) == len(
  690. indices
  691. ), "The length of boxes and indices must be the same."
  692. # Sort by y_min for Y-axis projection
  693. y_sorted_indices = boxes[:, 1].argsort()
  694. y_sorted_boxes = boxes[y_sorted_indices]
  695. y_sorted_indices = np.array(indices)[y_sorted_indices]
  696. # Perform Y-axis projection
  697. y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
  698. y_intervals = _split_projection_profile(y_projection, 0, 1)
  699. if not y_intervals:
  700. return
  701. # Process each segment defined by Y-axis projection
  702. for y_start, y_end in zip(*y_intervals):
  703. # Select boxes within the current y interval
  704. y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
  705. y_sorted_boxes[:, 1] < y_end
  706. )
  707. y_boxes_chunk = y_sorted_boxes[y_interval_indices]
  708. y_indices_chunk = y_sorted_indices[y_interval_indices]
  709. # Sort by x_min for X-axis projection
  710. x_sorted_indices = y_boxes_chunk[:, 0].argsort()
  711. x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
  712. x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
  713. # Perform X-axis projection
  714. x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
  715. x_intervals = _split_projection_profile(x_projection, 0, min_gap)
  716. if not x_intervals:
  717. continue
  718. # If X-axis cannot be further segmented, add current indices to results
  719. if len(x_intervals[0]) == 1:
  720. res.extend(x_sorted_indices_chunk)
  721. continue
  722. # Recursively process each segment defined by X-axis projection
  723. for x_start, x_end in zip(*x_intervals):
  724. x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
  725. x_sorted_boxes_chunk[:, 0] < x_end
  726. )
  727. _recursive_yx_cut(
  728. x_sorted_boxes_chunk[x_interval_indices],
  729. x_sorted_indices_chunk[x_interval_indices],
  730. res,
  731. )
  732. def _recursive_xy_cut(
  733. boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
  734. ):
  735. """
  736. Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
  737. Args:
  738. boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
  739. indices: A list of indices representing the position of boxes in the original data.
  740. res: A list to store indices of bounding boxes that meet the criteria.
  741. min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
  742. Returns:
  743. None: This function modifies the `res` list in place.
  744. """
  745. # Ensure boxes and indices have the same length
  746. assert len(boxes) == len(
  747. indices
  748. ), "The length of boxes and indices must be the same."
  749. # Sort by x_min to prepare for X-axis projection
  750. x_sorted_indices = boxes[:, 0].argsort()
  751. x_sorted_boxes = boxes[x_sorted_indices]
  752. x_sorted_indices = np.array(indices)[x_sorted_indices]
  753. # Perform X-axis projection
  754. x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
  755. x_intervals = _split_projection_profile(x_projection, 0, 1)
  756. if not x_intervals:
  757. return
  758. # Process each segment defined by X-axis projection
  759. for x_start, x_end in zip(*x_intervals):
  760. # Select boxes within the current x interval
  761. x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
  762. x_sorted_boxes[:, 0] < x_end
  763. )
  764. x_boxes_chunk = x_sorted_boxes[x_interval_indices]
  765. x_indices_chunk = x_sorted_indices[x_interval_indices]
  766. # Sort selected boxes by y_min to prepare for Y-axis projection
  767. y_sorted_indices = x_boxes_chunk[:, 1].argsort()
  768. y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
  769. y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
  770. # Perform Y-axis projection
  771. y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
  772. y_intervals = _split_projection_profile(y_projection, 0, min_gap)
  773. if not y_intervals:
  774. continue
  775. # If Y-axis cannot be further segmented, add current indices to results
  776. if len(y_intervals[0]) == 1:
  777. res.extend(y_sorted_indices_chunk)
  778. continue
  779. # Recursively process each segment defined by Y-axis projection
  780. for y_start, y_end in zip(*y_intervals):
  781. y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
  782. y_sorted_boxes_chunk[:, 1] < y_end
  783. )
  784. _recursive_xy_cut(
  785. y_sorted_boxes_chunk[y_interval_indices],
  786. y_sorted_indices_chunk[y_interval_indices],
  787. res,
  788. )
  789. def sort_by_xycut(
  790. block_bboxes: Union[np.ndarray, List[List[int]]],
  791. direction: int = 0,
  792. min_gap: int = 1,
  793. ) -> List[int]:
  794. """
  795. Sort bounding boxes using recursive XY cut method based on the specified direction.
  796. Args:
  797. block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
  798. where each box is represented as
  799. [x_min, y_min, x_max, y_max].
  800. direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
  801. Defaults to 0.
  802. min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
  803. Returns:
  804. List[int]: A list of indices representing the order of sorted bounding boxes.
  805. """
  806. block_bboxes = np.asarray(block_bboxes).astype(int)
  807. res = []
  808. if direction == 1:
  809. _recursive_yx_cut(
  810. block_bboxes,
  811. np.arange(len(block_bboxes)).tolist(),
  812. res,
  813. min_gap,
  814. )
  815. else:
  816. _recursive_xy_cut(
  817. block_bboxes,
  818. np.arange(len(block_bboxes)).tolist(),
  819. res,
  820. min_gap,
  821. )
  822. return res
  823. def gather_imgs(original_img, layout_det_objs):
  824. imgs_in_doc = []
  825. for det_obj in layout_det_objs:
  826. if det_obj["label"] in ("image", "chart"):
  827. x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"]))
  828. img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
  829. img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1])
  830. imgs_in_doc.append(
  831. {
  832. "path": img_path,
  833. "img": img,
  834. "coordinate": (x_min, y_min, x_max, y_max),
  835. "score": det_obj["score"],
  836. }
  837. )
  838. return imgs_in_doc
  839. def _get_minbox_if_overlap_by_ratio(
  840. bbox1: Union[List[int], Tuple[int, int, int, int]],
  841. bbox2: Union[List[int], Tuple[int, int, int, int]],
  842. ratio: float,
  843. smaller: bool = True,
  844. ) -> Optional[Union[List[int], Tuple[int, int, int, int]]]:
  845. """
  846. Determine if the overlap area between two bounding boxes exceeds a given ratio
  847. and return the smaller (or larger) bounding box based on the `smaller` flag.
  848. Args:
  849. bbox1 (Union[List[int], Tuple[int, int, int, int]]): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
  850. bbox2 (Union[List[int], Tuple[int, int, int, int]]): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
  851. ratio (float): The overlap ratio threshold.
  852. smaller (bool): If True, return the smaller bounding box; otherwise, return the larger one.
  853. Returns:
  854. Optional[Union[List[int], Tuple[int, int, int, int]]]:
  855. The selected bounding box or None if the overlap ratio is not exceeded.
  856. """
  857. # Calculate the areas of both bounding boxes
  858. area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
  859. area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
  860. # Calculate the overlap ratio using a helper function
  861. overlap_ratio = _calculate_overlap_area_div_minbox_area_ratio(bbox1, bbox2)
  862. # Check if the overlap ratio exceeds the threshold
  863. if overlap_ratio > ratio:
  864. if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
  865. return 1
  866. else:
  867. return 2
  868. return None
  869. def _remove_overlap_blocks(
  870. blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
  871. ) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
  872. """
  873. Remove overlapping blocks based on a specified overlap ratio threshold.
  874. Args:
  875. blocks (List[Dict[str, List[int]]]): List of block dictionaries, each containing a 'block_bbox' key.
  876. threshold (float): Ratio threshold to determine significant overlap.
  877. smaller (bool): If True, the smaller block in overlap is removed.
  878. Returns:
  879. Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
  880. A tuple containing the updated list of blocks and a list of dropped blocks.
  881. """
  882. dropped_blocks = []
  883. dropped_indexes = set()
  884. # Iterate over each pair of blocks to find overlaps
  885. for i, block1 in enumerate(blocks):
  886. for j in range(i + 1, len(blocks)):
  887. block2 = blocks[j]
  888. # Skip blocks that are already marked for removal
  889. if i in dropped_indexes or j in dropped_indexes:
  890. continue
  891. # Check for overlap and determine which block to remove
  892. overlap_box_index = _get_minbox_if_overlap_by_ratio(
  893. block1["coordinate"],
  894. block2["coordinate"],
  895. threshold,
  896. smaller=smaller,
  897. )
  898. if overlap_box_index is not None:
  899. # Determine which block to remove based on overlap_box_index
  900. if overlap_box_index == 1:
  901. drop_index = i
  902. else:
  903. drop_index = j
  904. dropped_indexes.add(drop_index)
  905. # Remove marked blocks from the original list
  906. for index in sorted(dropped_indexes, reverse=True):
  907. dropped_blocks.append(blocks[index])
  908. del blocks[index]
  909. return blocks, dropped_blocks
  910. def _get_text_median_width(blocks: List[Dict[str, any]]) -> float:
  911. """
  912. Calculate the median width of blocks labeled as "text".
  913. Args:
  914. blocks (List[Dict[str, any]]): List of block dictionaries, each containing a 'block_bbox' and 'label'.
  915. Returns:
  916. float: The median width of text blocks, or infinity if no text blocks are found.
  917. """
  918. widths = [
  919. block["block_bbox"][2] - block["block_bbox"][0]
  920. for block in blocks
  921. if block.get("block_label") == "text"
  922. ]
  923. return np.median(widths) if widths else float("inf")
def _get_layout_property(
    blocks: List[Dict[str, Any]],
    median_width: float,
    no_mask_labels: List[str],
    threshold: float = 0.8,
) -> Tuple[List[Dict[str, Any]], bool]:
    """
    Tag blocks as "single" or "double" layout and compare the tagged areas.

    Only blocks whose 'block_label' is in ``no_mask_labels`` are analysed and
    receive a 'layout' key. ``blocks`` is sorted in place by
    (x_min, width) before the analysis.

    Args:
        blocks (List[Dict[str, Any]]): List of block dictionaries containing
            'block_label' and 'block_bbox' ([x_min, y_min, x_max, y_max]).
        median_width (float): Median width of text blocks.
        no_mask_labels (List[str]): Labels of blocks to be considered for layout analysis.
        threshold (float): Horizontal overlap fraction (relative to the other
            block's width) above which the other block is recorded as a
            strong match.

    Returns:
        Tuple[List[Dict[str, Any]], bool]: The (sorted, annotated) blocks and
        True when the accumulated "double" area is greater than the
        accumulated "single" area. The "single" area only counts single blocks
        that have at least one strong match and were not promoted to "double".
    """
    # Sort in place: left-most first, narrower first on ties.
    blocks.sort(
        key=lambda x: (
            x["block_bbox"][0],
            (x["block_bbox"][2] - x["block_bbox"][0]),
        ),
    )
    # Maps the index of each "single" block to its list of (index, overlap) strong matches.
    check_single_layout = {}
    page_min_x, page_max_x = float("inf"), 0
    double_label_area = 0
    single_label_area = 0
    # Horizontal extent covered by all blocks, regardless of label.
    for i, block in enumerate(blocks):
        page_min_x = min(page_min_x, block["block_bbox"][0])
        page_max_x = max(page_max_x, block["block_bbox"][2])
    page_width = page_max_x - page_min_x
    for i, block in enumerate(blocks):
        if block["block_label"] not in no_mask_labels:
            continue

        x_min_i, _, x_max_i, _ = block["block_bbox"]
        layout_length = x_max_i - x_min_i
        cover_count, cover_with_threshold_count = 0, 0
        match_block_with_threshold_indexes = []

        for j, other_block in enumerate(blocks):
            if i == j or other_block["block_label"] not in no_mask_labels:
                continue

            x_min_j, _, x_max_j, _ = other_block["block_bbox"]
            # Horizontal intersection between the not-yet-covered part of
            # block i and block j.
            x_match_min, x_match_max = max(
                x_min_i,
                x_min_j,
            ), min(x_max_i, x_max_j)
            # Fraction of block j's width that lies inside that part.
            # NOTE(review): a zero-width block j makes the denominator 0 here.
            match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)

            if match_block_iou > 0:
                cover_count += 1
                if match_block_iou > threshold:
                    cover_with_threshold_count += 1
                    match_block_with_threshold_indexes.append(
                        (j, match_block_iou),
                    )
                # Move the left edge past the part just matched so later
                # blocks are measured against the remainder only.
                x_min_i = x_match_max
                if x_min_i >= x_max_i:
                    # Nothing of block i is left to match.
                    break

        # "double": clearly wider than the median text block while spanning at
        # least two other blocks, or wider than 60% of the page extent.
        if (
            layout_length > median_width * 1.3
            and (cover_with_threshold_count >= 2 or cover_count >= 2)
        ) or layout_length > 0.6 * page_width:
            # if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
            block["layout"] = "double"
            double_label_area += (block["block_bbox"][2] - block["block_bbox"][0]) * (
                block["block_bbox"][3] - block["block_bbox"][1]
            )
        else:
            block["layout"] = "single"
            check_single_layout[i] = match_block_with_threshold_indexes

    # Check single-layout block: promote it to "double" when its last strong
    # match overlaps by more than 0.9 and is itself "double".
    for i, single_layout in check_single_layout.items():
        if single_layout:
            index, match_iou = single_layout[-1]
            if match_iou > 0.9 and blocks[index]["layout"] == "double":
                blocks[i]["layout"] = "double"
                double_label_area += (
                    blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
                ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
            else:
                single_label_area += (
                    blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
                ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])

    return blocks, (double_label_area > single_label_area)
  1007. def _get_bbox_direction(input_bbox: List[float], ratio: float = 1.0) -> bool:
  1008. """
  1009. Determine if a bounding box is horizontal or vertical.
  1010. Args:
  1011. input_bbox (List[float]): Bounding box [x_min, y_min, x_max, y_max].
  1012. ratio (float): Ratio for determining orientation. Default is 1.0.
  1013. Returns:
  1014. bool: True if the bounding box is considered horizontal, False if vertical.
  1015. """
  1016. width = input_bbox[2] - input_bbox[0]
  1017. height = input_bbox[3] - input_bbox[1]
  1018. return width * ratio >= height
  1019. def _get_projection_iou(
  1020. input_bbox: List[float], match_bbox: List[float], is_horizontal: bool = True
  1021. ) -> float:
  1022. """
  1023. Calculate the IoU of lines between two bounding boxes.
  1024. Args:
  1025. input_bbox (List[float]): First bounding box [x_min, y_min, x_max, y_max].
  1026. match_bbox (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
  1027. is_horizontal (bool): Whether to compare horizontally or vertically.
  1028. Returns:
  1029. float: Line IoU. Returns 0 if there is no overlap.
  1030. """
  1031. if is_horizontal:
  1032. x_match_min = max(input_bbox[0], match_bbox[0])
  1033. x_match_max = min(input_bbox[2], match_bbox[2])
  1034. overlap = max(0, x_match_max - x_match_min)
  1035. input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
  1036. else:
  1037. y_match_min = max(input_bbox[1], match_bbox[1])
  1038. y_match_max = min(input_bbox[3], match_bbox[3])
  1039. overlap = max(0, y_match_max - y_match_min)
  1040. input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
  1041. return overlap / input_width if input_width > 0 else 0.0
def _get_sub_category(
    blocks: List[Dict[str, Any]], title_labels: List[str]
) -> Tuple[List[Dict[str, Any]], Dict[str, List[float]]]:
    """
    Determine the layout of title and text blocks and collect pre_cuts.

    Every block gets the keys "title_text", "sub_title", "vision_footnote"
    (default ``[]``) and "sub_label" (default: its "block_label"). For blocks
    that are titles, paragraph titles, vision blocks or vision titles, the
    nearest neighbour before (left/up) and after (right/down) the block is
    located, and a "text" neighbour may have its "sub_label" rewritten to
    "vision_footnote" or "title_text".

    Args:
        blocks (List[Dict[str, Any]]): List of block dictionaries with
            "block_label" and "block_bbox" ([x_min, y_min, x_max, y_max]).
            The dictionaries are modified in place.
        title_labels (List[str]): List of labels considered as titles.

    Returns:
        List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
        Dict[str, List[float]]: pre_cuts, with an optional "y" list (y_min of
            qualifying horizontal blocks) and an optional "x" list (x_min of
            qualifying vertical blocks). Empty when ``blocks`` is empty or no
            block qualifies.
    """

    sub_title_labels = ["paragraph_title"]
    vision_labels = ["image", "table", "chart", "figure"]
    vision_title_labels = ["figure_title", "chart_title", "table_title"]
    all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
    special_pre_cut_labels = sub_title_labels

    # A single doc title is irregular, so the special pre-cut treatment is
    # extended to title_labels only once a second "doc_title" block is seen.
    num_doc_title = 0
    for block in blocks:
        if block["block_label"] == "doc_title":
            num_doc_title += 1
            if num_doc_title == 2:
                special_pre_cut_labels = title_labels + sub_title_labels
                break
    if len(blocks) == 0:
        return blocks, {}

    # Bounding region enclosing every block.
    min_x = min(block["block_bbox"][0] for block in blocks)
    min_y = min(block["block_bbox"][1] for block in blocks)
    max_x = max(block["block_bbox"][2] for block in blocks)
    max_y = max(block["block_bbox"][3] for block in blocks)
    region_bbox = (min_x, min_y, max_x, max_y)
    region_x_center = (region_bbox[0] + region_bbox[2]) / 2
    region_y_center = (region_bbox[1] + region_bbox[3]) / 2
    region_width = region_bbox[2] - region_bbox[0]
    region_height = region_bbox[3] - region_bbox[1]

    pre_cuts = {}

    for i, block1 in enumerate(blocks):
        block1.setdefault("title_text", [])
        block1.setdefault("sub_title", [])
        block1.setdefault("vision_footnote", [])
        block1.setdefault("sub_label", block1["block_label"])

        # Only title / sub-title / vision / vision-title blocks are analysed further.
        if block1["block_label"] not in all_labels:
            continue

        bbox1 = block1["block_bbox"]
        x1, y1, x2, y2 = bbox1
        is_horizontal_1 = _get_bbox_direction(block1["block_bbox"])
        # Best neighbour found so far on each side (-1 means none).
        left_up_title_text_distance = float("inf")
        left_up_title_text_index = -1
        left_up_title_text_direction = None
        right_down_title_text_distance = float("inf")
        right_down_title_text_index = -1
        right_down_title_text_direction = None

        # pre-cuts
        # Condition 1: Length is greater than half of the layout region
        # (always satisfied for labels in special_pre_cut_labels)
        if is_horizontal_1:
            block_length = x2 - x1
            required_length = region_width / 2
        else:
            block_length = y2 - y1
            required_length = region_height / 2
        if block1["block_label"] in special_pre_cut_labels:
            length_condition = True
        else:
            length_condition = block_length > required_length

        # Condition 2: Centered check. The block centre must lie within
        # block_length // 5 of the region centre along the block's main axis
        # (block_length // 10 for labels in special_pre_cut_labels).
        block_x_center = (x1 + x2) / 2
        block_y_center = (y1 + y2) / 2
        tolerance_len = block_length // 5
        if block1["block_label"] in special_pre_cut_labels:
            tolerance_len = block_length // 10
        if is_horizontal_1:
            is_centered = abs(block_x_center - region_x_center) <= tolerance_len
        else:
            is_centered = abs(block_y_center - region_y_center) <= tolerance_len

        # Condition 3: Check for surrounding text
        has_left_text = False
        has_right_text = False
        has_above_text = False
        has_below_text = False
        for block2 in blocks:
            if block2["block_label"] != "text":
                continue
            bbox2 = block2["block_bbox"]
            x1_2, y1_2, x2_2, y2_2 = bbox2
            if is_horizontal_1:
                # Text entirely to the left/right whose y-range overlaps block1.
                if x2_2 <= x1 and not (y2_2 <= y1 or y1_2 >= y2):
                    has_left_text = True
                if x1_2 >= x2 and not (y2_2 <= y1 or y1_2 >= y2):
                    has_right_text = True
            else:
                # Text entirely above/below whose x-range overlaps block1.
                if y2_2 <= y1 and not (x2_2 <= x1 or x1_2 >= x2):
                    has_above_text = True
                if y1_2 >= y2 and not (x2_2 <= x1 or x1_2 >= x2):
                    has_below_text = True

            # Stop scanning once text has been found on both sides.
            if (is_horizontal_1 and has_left_text and has_right_text) or (
                not is_horizontal_1 and has_above_text and has_below_text
            ):
                break

        no_text_on_sides = (
            not (has_left_text or has_right_text)
            if is_horizontal_1
            else not (has_above_text or has_below_text)
        )

        # Add coordinates if all conditions are met
        if is_centered and length_condition and no_text_on_sides:
            if is_horizontal_1:
                pre_cuts.setdefault("y", []).append(y1)
            else:
                pre_cuts.setdefault("x", []).append(x1)

        # Find the closest neighbour before and after block1.
        for j, block2 in enumerate(blocks):
            if i == j:
                continue

            bbox2 = block2["block_bbox"]
            x1_prime, y1_prime, x2_prime, y2_prime = bbox2
            is_horizontal_2 = _get_bbox_direction(bbox2)
            match_block_iou = _get_projection_iou(
                bbox2,
                bbox1,
                is_horizontal_1,
            )

            def distance_(is_horizontal, is_left_up):
                # Gap between block1 and the current neighbour, floor-divided
                # into steps of 5, plus the neighbour's leading coordinate
                # divided by 5000 (a tie-break within a step while that
                # coordinate stays below 5000).
                if is_horizontal:
                    if is_left_up:
                        return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
                    else:
                        return (y1_prime - y2 + 2) // 5 + x1_prime / 5000

                else:
                    if is_left_up:
                        return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
                    else:
                        return (x1_prime - x2 + 2) // 5 + y1_prime / 5000

            # Paragraph titles require a larger projection overlap to pair up.
            block_iou_threshold = 0.1
            if block1["block_label"] in sub_title_labels:
                block_iou_threshold = 0.5

            if is_horizontal_1:
                if match_block_iou >= block_iou_threshold:
                    left_up_distance = distance_(True, True)
                    right_down_distance = distance_(True, False)
                    # Neighbour ends at or above block1's top edge.
                    if (
                        y2_prime <= y1
                        and left_up_distance <= left_up_title_text_distance
                    ):
                        left_up_title_text_distance = left_up_distance
                        left_up_title_text_index = j
                        left_up_title_text_direction = is_horizontal_2
                    # Neighbour starts below block1's bottom edge.
                    elif (
                        y1_prime > y2
                        and right_down_distance < right_down_title_text_distance
                    ):
                        right_down_title_text_distance = right_down_distance
                        right_down_title_text_index = j
                        right_down_title_text_direction = is_horizontal_2
            else:
                if match_block_iou >= block_iou_threshold:
                    left_up_distance = distance_(False, True)
                    right_down_distance = distance_(False, False)
                    # Neighbour ends at or left of block1's left edge.
                    if (
                        x2_prime <= x1
                        and left_up_distance <= left_up_title_text_distance
                    ):
                        left_up_title_text_distance = left_up_distance
                        left_up_title_text_index = j
                        left_up_title_text_direction = is_horizontal_2
                    # Neighbour starts right of block1's right edge.
                    elif (
                        x1_prime > x2
                        and right_down_distance < right_down_title_text_distance
                    ):
                        right_down_title_text_distance = right_down_distance
                        right_down_title_text_index = j
                        right_down_title_text_direction = is_horizontal_2

        height = bbox1[3] - bbox1[1]
        width = bbox1[2] - bbox1[0]
        title_text_weight = [0.8, 0.8]

        # Collected by get_sub_category_ below and stored on block1 afterwards.
        title_text, sub_title, vision_footnote = [], [], []

        def get_sub_category_(
            title_text_direction,
            title_text_index,
            label,
            is_left_up=True,
        ):
            # Classify one neighbour of block1. Nothing happens unless the
            # neighbour exists (index != -1), has the same orientation as
            # block1, and is labelled "text" or "paragraph_title".
            # direction_ holds the codes stored alongside a title_text bbox;
            # their meaning is defined by the consumer (not visible here).
            direction_ = [1, 3] if is_left_up else [2, 4]
            if (
                title_text_direction == is_horizontal_1
                and title_text_index != -1
                and (label == "text" or label == "paragraph_title")
            ):
                bbox2 = blocks[title_text_index]["block_bbox"]
                if is_horizontal_1:
                    height1 = bbox2[3] - bbox2[1]
                    width1 = bbox2[2] - bbox2[0]
                    if label == "text":
                        # Small text close to a vision block -> footnote.
                        # NOTE(review): element [0] of _nearest_edge_distance's
                        # result is assumed to be the edge distance - confirm.
                        if (
                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
                            and block1["block_label"] in vision_labels
                            and width1 < width
                            and height1 < 0.5 * height
                        ):
                            blocks[title_text_index]["sub_label"] = "vision_footnote"
                            vision_footnote.append(bbox2)
                        # Shorter text next to a title -> title text.
                        elif (
                            height1 < height * title_text_weight[0]
                            and (width1 < width or width1 > 1.5 * width)
                            and block1["block_label"] in title_labels
                        ):
                            blocks[title_text_index]["sub_label"] = "title_text"
                            title_text.append((direction_[0], bbox2))
                    elif (
                        label == "paragraph_title"
                        and block1["block_label"] in sub_title_labels
                    ):
                        sub_title.append(bbox2)
                else:
                    height1 = bbox2[3] - bbox2[1]
                    width1 = bbox2[2] - bbox2[0]
                    if label == "text":
                        # Same rules as above with width and height swapped.
                        if (
                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
                            and block1["block_label"] in vision_labels
                            and height1 < height
                            and width1 < 0.5 * width
                        ):
                            blocks[title_text_index]["sub_label"] = "vision_footnote"
                            vision_footnote.append(bbox2)
                        elif (
                            width1 < width * title_text_weight[1]
                            and block1["block_label"] in title_labels
                        ):
                            blocks[title_text_index]["sub_label"] = "title_text"
                            title_text.append((direction_[1], bbox2))
                    elif (
                        label == "paragraph_title"
                        and block1["block_label"] in sub_title_labels
                    ):
                        sub_title.append(bbox2)

        # When one side is clearly closer than the other (difference, scaled
        # back by 5, exceeds block1's thickness) only the closer neighbour is
        # classified; otherwise both neighbours are.
        if (
            is_horizontal_1
            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
            > height
        ) or (
            not is_horizontal_1
            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
            > width
        ):
            if left_up_title_text_distance < right_down_title_text_distance:
                get_sub_category_(
                    left_up_title_text_direction,
                    left_up_title_text_index,
                    blocks[left_up_title_text_index]["block_label"],
                    True,
                )
            else:
                get_sub_category_(
                    right_down_title_text_direction,
                    right_down_title_text_index,
                    blocks[right_down_title_text_index]["block_label"],
                    False,
                )
        else:
            # An index of -1 (no neighbour) reads the last block's label here,
            # but get_sub_category_ ignores the call because of its -1 check.
            get_sub_category_(
                left_up_title_text_direction,
                left_up_title_text_index,
                blocks[left_up_title_text_index]["block_label"],
                True,
            )
            get_sub_category_(
                right_down_title_text_direction,
                right_down_title_text_index,
                blocks[right_down_title_text_index]["block_label"],
                False,
            )

        # Store the collected neighbours, but only if nothing was stored before.
        if block1["block_label"] in title_labels:
            if blocks[i].get("title_text") == []:
                blocks[i]["title_text"] = title_text

        if block1["block_label"] in sub_title_labels:
            if blocks[i].get("sub_title") == []:
                blocks[i]["sub_title"] = sub_title

        if block1["block_label"] in vision_labels:
            if blocks[i].get("vision_footnote") == []:
                blocks[i]["vision_footnote"] = vision_footnote

    return blocks, pre_cuts
def get_layout_ordering(
    parsing_res_list: List[Dict[str, Any]],
    no_mask_labels: List[str] = [],
) -> List[Dict[str, Any]]:
    """
    Assign a reading order to layout blocks and return them as a new, ordered list.

    The blocks are first passed through `_get_sub_category`. If that call
    reports pre-cuts, the blocks are split into groups along those cuts and
    every group is ordered independently. Inside a group the blocks are
    bucketed by 'sub_label'; the blocks left in the base list are ordered with
    `sort_by_xycut`, and each remaining bucket is then attached to the
    nearest already-ordered block and merged back in.

    Args:
        parsing_res_list (List[Dict[str, Any]]): List of block dictionaries. 'block_bbox',
            'block_label' and 'block_content' are read here; 'sub_label', 'layout',
            'title_text', 'sub_title' and 'vision_footnote' are read as well and are
            presumably filled in by `_get_sub_category` / `_get_layout_property`
            (not visible here -- confirm).
        no_mask_labels (List[str]): Labels whose blocks stay in the base (xy-cut) ordering
            unless their 'layout' is "double". Also forwarded to `_get_layout_property`
            and `_nearest_iou_edge_distance`.

    Returns:
        List[Dict[str, Any]]: One new dictionary per block, in final order, with the keys
            'block_label', 'block_content', 'block_bbox', 'block_image', 'sub_label',
            'sub_index', 'index', 'seg_start_coordinate', 'seg_end_coordinate' and
            'num_of_lines'. 'index' is None for blocks that never received one.
    """
    title_text_labels = ["doc_title"]
    title_labels = ["doc_title", "paragraph_title"]
    vision_labels = ["image", "table", "seal", "chart", "figure"]
    vision_title_labels = ["table_title", "chart_title", "figure_title"]

    parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)

    # Split the page into independent groups along every pre-cut coordinate.
    parsing_res_by_pre_cuts_list = []
    if len(pre_cuts) > 0:
        block_bboxes = [block["block_bbox"] for block in parsing_res_list]
        for axis, cuts in pre_cuts.items():
            # "y" cuts compare bbox[1] (top edge); anything else compares bbox[0] (left edge).
            axis_index = 1 if axis == "y" else 0

            max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)

            # Consecutive half-open intervals [prev, cut) covering 0 .. max_val.
            intervals = []
            prev = 0
            for cut in sorted(cuts):
                intervals.append((prev, cut))
                prev = cut
            intervals.append((prev, max_val))

            for start, end in intervals:
                # A block belongs to the interval containing its start coordinate.
                mask = [
                    (bbox[axis_index] >= start) and (bbox[axis_index] < end)
                    for bbox in block_bboxes
                ]
                parsing_res_by_pre_cuts_list.append(
                    [parsing_res_list[i] for i, m in enumerate(mask) if m]
                )
    else:
        parsing_res_by_pre_cuts_list = [parsing_res_list]

    final_parsing_res_list = []
    # Running offsets so that indexes keep increasing across groups.
    num_index = 0
    num_sub_index = 0
    for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:

        doc_flag = False
        median_width = _get_text_median_width(parsing_res_by_pre_cuts)
        parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
            parsing_res_by_pre_cuts,
            median_width,
            no_mask_labels=no_mask_labels,
            threshold=0.3,
        )
        # Cast bbox coordinates to int and bucket the blocks by sub_label.
        (
            double_text_blocks,
            title_text_blocks,
            title_blocks,
            vision_blocks,
            vision_title_blocks,
            vision_footnote_blocks,
            other_blocks,
        ) = ([], [], [], [], [], [], [])

        drop_indexes = []

        for index, block in enumerate(parsing_res_by_pre_cuts):
            label = block["sub_label"]
            block["block_bbox"] = list(map(int, block["block_bbox"]))

            if label == "doc_title":
                doc_flag = True

            if label in no_mask_labels:
                # Only "double" layout blocks leave the base list; the others stay
                # and are ordered by the xy-cut below.
                if block["layout"] == "double":
                    double_text_blocks.append(block)
                    drop_indexes.append(index)
            elif label == "title_text":
                title_text_blocks.append(block)
                drop_indexes.append(index)
            elif label == "vision_footnote":
                vision_footnote_blocks.append(block)
                drop_indexes.append(index)
            elif label in vision_title_labels:
                vision_title_blocks.append(block)
                drop_indexes.append(index)
            elif label in title_labels:
                title_blocks.append(block)
                drop_indexes.append(index)
            elif label in vision_labels:
                vision_blocks.append(block)
                drop_indexes.append(index)
            else:
                other_blocks.append(block)
                drop_indexes.append(index)

        # Delete from the back so the remaining positions stay valid.
        for index in sorted(drop_indexes, reverse=True):
            del parsing_res_by_pre_cuts[index]

        if len(parsing_res_by_pre_cuts) > 0:
            # single text label
            if (
                len(double_text_blocks) > len(parsing_res_by_pre_cuts)
                or projection_direction
            ):
                # Fold titles and "double" blocks back in and xy-cut everything together.
                parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
                title_blocks = []
                double_text_blocks = []
                block_bboxes = [
                    block["block_bbox"] for block in parsing_res_by_pre_cuts
                ]
                block_bboxes.sort(
                    key=lambda x: (
                        x[0] // max(20, median_width),
                        x[1],
                    ),
                )
                block_bboxes = np.array(block_bboxes)
                sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
            else:
                block_bboxes = [
                    block["block_bbox"] for block in parsing_res_by_pre_cuts
                ]
                block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
                block_bboxes = np.array(block_bboxes)
                sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)

            sorted_boxes = block_bboxes[sorted_indices].tolist()

            # A block's order is the position of its bbox in the xy-cut result.
            # NOTE(review): blocks with identical bboxes get the same position.
            for block in parsing_res_by_pre_cuts:
                block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
                block["sub_index"] = (
                    num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
                )

        def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
            """
            Give each block in `input_blocks` the order of its nearest ordered block.

            For every input block the candidate in `parsing_res_by_pre_cuts` with the
            smallest distance (as defined by `distance_type`) is found; its 'index'
            (when `is_add_index` is True) or 'sub_index' (otherwise) is copied onto
            the input block, which is then appended to `parsing_res_by_pre_cuts`.
            Because of that append, later input blocks can match earlier ones.
            """
            for block in input_blocks:
                bbox = block["block_bbox"]
                min_distance = float("inf")
                min_distance_config = [
                    [float("inf"), float("inf")],
                    float("inf"),
                    float("inf"),
                ]  # for double text
                # Stays 0 when no candidate yields a finite distance.
                nearest_gt_index = 0
                for match_block in parsing_res_by_pre_cuts:
                    match_bbox = match_block["block_bbox"]
                    if distance_type == "nearest_iou_edge_distance":
                        distance, min_distance_config = _nearest_iou_edge_distance(
                            bbox,
                            match_bbox,
                            block["sub_label"],
                            vision_labels=vision_labels,
                            no_mask_labels=no_mask_labels,
                            median_width=median_width,
                            title_labels=title_labels,
                            title_text=block["title_text"],
                            sub_title=block["sub_title"],
                            min_distance_config=min_distance_config,
                            tolerance_len=10,
                        )
                    elif distance_type == "title_text":
                        # Only titles/abstracts that recorded title_text boxes are candidates.
                        if (
                            match_block["block_label"] in title_labels + ["abstract"]
                            and match_block["title_text"] != []
                        ):
                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
                                bbox,
                                match_block["title_text"][0][1],
                            )
                            iou_right_down = (
                                _calculate_overlap_area_div_minbox_area_ratio(
                                    bbox,
                                    match_block["title_text"][-1][1],
                                )
                            )
                            # Larger overlap with the first/last recorded box shrinks the distance.
                            iou = 1 - max(iou_left_up, iou_right_down)
                            distance = _manhattan_distance(bbox, match_bbox) * iou
                        else:
                            distance = float("inf")
                    elif distance_type == "manhattan":
                        distance = _manhattan_distance(bbox, match_bbox)
                    elif distance_type == "vision_footnote":
                        # Only vision blocks that recorded footnote boxes are candidates.
                        if (
                            match_block["block_label"] in vision_labels
                            and match_block["vision_footnote"] != []
                        ):
                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
                                bbox,
                                match_block["vision_footnote"][0],
                            )
                            iou_right_down = (
                                _calculate_overlap_area_div_minbox_area_ratio(
                                    bbox,
                                    match_block["vision_footnote"][-1],
                                )
                            )
                            iou = 1 - max(iou_left_up, iou_right_down)
                            distance = _manhattan_distance(bbox, match_bbox) * iou
                        else:
                            distance = float("inf")
                    elif distance_type == "vision_body":
                        # Not requested by any call in this function.
                        if (
                            match_block["block_label"] in vision_title_labels
                            and block["vision_footnote"] != []
                        ):
                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
                                match_bbox,
                                block["vision_footnote"][0],
                            )
                            iou_right_down = (
                                _calculate_overlap_area_div_minbox_area_ratio(
                                    match_bbox,
                                    block["vision_footnote"][-1],
                                )
                            )
                            iou = 1 - max(iou_left_up, iou_right_down)
                            distance = _manhattan_distance(bbox, match_bbox) * iou
                        else:
                            distance = float("inf")
                    # When a reference block crosses multiple columns, it should be
                    # ordered after the blocks above it: among candidates that end at
                    # or above this block's top edge, the one with the largest
                    # (x2 * 10 + y2) wins.
                    elif distance_type == "append":
                        if match_bbox[3] <= bbox[1]:
                            distance = -(match_bbox[2] * 10 + match_bbox[3])
                        else:
                            distance = float("inf")
                    else:
                        raise NotImplementedError

                    if distance < min_distance:
                        min_distance = distance
                        if is_add_index:
                            nearest_gt_index = match_block.get("index", 999)
                        else:
                            nearest_gt_index = match_block.get("sub_index", 999)

                if is_add_index:
                    block["index"] = nearest_gt_index
                else:
                    block["sub_index"] = nearest_gt_index

                parsing_res_by_pre_cuts.append(block)

        # double text label
        double_text_blocks.sort(
            key=lambda x: (
                x["block_bbox"][1] // 10,
                x["block_bbox"][0] // median_width,
                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
            ),
        )
        # Separate the "reference" blocks from the blocks that cross multiple
        # columns; they are ordered with the "append" distance instead.
        double_text_reference_blocks = []
        i = 0
        while i < len(double_text_blocks):
            if double_text_blocks[i]["block_label"] == "reference":
                double_text_reference_blocks.append(double_text_blocks.pop(i))
            else:
                i += 1
        nearest_match_(
            double_text_blocks,
            distance_type="nearest_iou_edge_distance",
        )
        nearest_match_(
            double_text_reference_blocks,
            distance_type="append",
        )
        parsing_res_by_pre_cuts.sort(
            key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
        )

        # Renumber consecutively after the merge.
        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["index"] = num_index + idx + 1
            block["sub_index"] = num_sub_index + idx + 1

        # title label
        title_blocks.sort(
            key=lambda x: (
                x["block_bbox"][1] // 10,
                x["block_bbox"][0] // median_width,
                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
            ),
        )
        nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")

        if doc_flag:
            text_sort_labels = ["doc_title"]
            text_label_priority = {
                label: priority for priority, label in enumerate(text_sort_labels)
            }
            doc_titles = []
            for i, block in enumerate(parsing_res_by_pre_cuts):
                if block["block_label"] == "doc_title":
                    doc_titles.append(
                        (i, block["block_bbox"][1], block["block_bbox"][0]),
                    )
            # Force the top-most (then left-most) doc_title to index 1 so it sorts first.
            doc_titles.sort(key=lambda x: (x[1], x[2]))
            first_doc_title_index = doc_titles[0][0]
            parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
            parsing_res_by_pre_cuts.sort(
                key=lambda x: (
                    x["index"],
                    text_label_priority.get(x["block_label"], 9999),
                    x["block_bbox"][1],
                    x["block_bbox"][0],
                ),
            )
        else:
            parsing_res_by_pre_cuts.sort(
                key=lambda x: (
                    x["index"],
                    x["block_bbox"][1],
                    x["block_bbox"][0],
                ),
            )

        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["index"] = num_index + idx + 1
            block["sub_index"] = num_sub_index + idx + 1

        # title-text label
        nearest_match_(title_text_blocks, distance_type="title_text")

        def hor_tb_and_ver_lr(x):
            # Tie-break key: top edge when `_get_bbox_direction` reports the box as
            # horizontal, left edge otherwise.
            input_bbox = x["block_bbox"]
            is_horizontal = _get_bbox_direction(input_bbox)
            if is_horizontal:
                return input_bbox[1]
            else:
                return input_bbox[0]

        parsing_res_by_pre_cuts.sort(
            key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
        )

        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["index"] = num_index + idx + 1
            block["sub_index"] = num_sub_index + idx + 1

        # From here on only 'sub_index' is assigned (is_add_index=False).
        # image, table, seal, chart, figure labels
        nearest_match_(
            vision_blocks,
            distance_type="nearest_iou_edge_distance",
            is_add_index=False,
        )
        parsing_res_by_pre_cuts.sort(
            key=lambda x: (
                x["sub_index"],
                x["block_bbox"][1],
                x["block_bbox"][0],
            ),
        )

        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["sub_index"] = num_sub_index + idx + 1

        # table, chart, figure title labels
        nearest_match_(
            vision_title_blocks,
            distance_type="nearest_iou_edge_distance",
            is_add_index=False,
        )
        parsing_res_by_pre_cuts.sort(
            key=lambda x: (
                x["sub_index"],
                x["block_bbox"][1],
                x["block_bbox"][0],
            ),
        )

        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["sub_index"] = num_sub_index + idx + 1

        # vision footnote label
        nearest_match_(
            vision_footnote_blocks,
            distance_type="vision_footnote",
            is_add_index=False,
        )
        # Footnotes sort after every other block sharing the same sub_index.
        text_label_priority = {"vision_footnote": 9999}
        parsing_res_by_pre_cuts.sort(
            key=lambda x: (
                x["sub_index"],
                text_label_priority.get(x["sub_label"], 0),
                x["block_bbox"][1],
                x["block_bbox"][0],
            ),
        )

        for idx, block in enumerate(parsing_res_by_pre_cuts):
            block["sub_index"] = num_sub_index + idx + 1

        # header, footnote, header_image, ... labels
        nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)

        # add all parsing result
        final_parsing_res_list.extend(parsing_res_by_pre_cuts)

        # update num index: every block adds to the sub_index offset, but only
        # blocks holding a truthy 'index' add to the index offset.
        num_sub_index += len(parsing_res_by_pre_cuts)
        for parsing_res in parsing_res_by_pre_cuts:
            if parsing_res.get("index"):
                num_index += 1

    # Build fresh dictionaries holding only the keys exposed to callers.
    parsing_res_list = [
        {
            "block_label": parsing_res["block_label"],
            "block_content": parsing_res["block_content"],
            "block_bbox": parsing_res["block_bbox"],
            "block_image": parsing_res.get("block_image", None),
            "sub_label": parsing_res["sub_label"],
            "sub_index": parsing_res["sub_index"],
            "index": parsing_res.get("index", None),
            "seg_start_coordinate": parsing_res.get(
                "seg_start_coordinate", float("inf")
            ),
            "seg_end_coordinate": parsing_res.get("seg_end_coordinate", float("-inf")),
            "num_of_lines": parsing_res.get("num_of_lines", 1),
        }
        for parsing_res in final_parsing_res_list
    ]

    return parsing_res_list
  1714. def _manhattan_distance(
  1715. point1: Tuple[float, float],
  1716. point2: Tuple[float, float],
  1717. weight_x: float = 1.0,
  1718. weight_y: float = 1.0,
  1719. ) -> float:
  1720. """
  1721. Calculate the weighted Manhattan distance between two points.
  1722. Args:
  1723. point1 (Tuple[float, float]): The first point as (x, y).
  1724. point2 (Tuple[float, float]): The second point as (x, y).
  1725. weight_x (float): The weight for the x-axis distance. Default is 1.0.
  1726. weight_y (float): The weight for the y-axis distance. Default is 1.0.
  1727. Returns:
  1728. float: The weighted Manhattan distance between the two points.
  1729. """
  1730. return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
  1731. def _calculate_horizontal_distance(
  1732. input_bbox: List[int],
  1733. match_bbox: List[int],
  1734. height: int,
  1735. disperse: int,
  1736. title_text: List[Tuple[int, List[int]]],
  1737. ) -> float:
  1738. """
  1739. Calculate the horizontal distance between two bounding boxes, considering title text adjustments.
  1740. Args:
  1741. input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
  1742. match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
  1743. height (int): The height of the input bounding box used for normalization.
  1744. disperse (int): The dispersion factor used to normalize the horizontal distance.
  1745. title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
  1746. Format: [(position_indicator, [x1, y1, x2, y2]), ...].
  1747. Returns:
  1748. float: The calculated horizontal distance taking into account the title text adjustments.
  1749. """
  1750. x1, y1, x2, y2 = input_bbox
  1751. x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
  1752. # Determine vertical distance adjustment based on title text
  1753. if y2 < y1_prime:
  1754. if title_text and title_text[-1][0] == 2:
  1755. y2 += title_text[-1][1][3] - title_text[-1][1][1]
  1756. vertical_adjustment = (y1_prime - y2) * 0.5
  1757. else:
  1758. if title_text and title_text[0][0] == 1:
  1759. y1 -= title_text[0][1][3] - title_text[0][1][1]
  1760. vertical_adjustment = y1 - y2_prime
  1761. # Calculate horizontal distance with adjustments
  1762. horizontal_distance = (
  1763. abs(x2_prime - x1) // disperse
  1764. + vertical_adjustment // height
  1765. + vertical_adjustment / 5000
  1766. )
  1767. return horizontal_distance
  1768. def _calculate_vertical_distance(
  1769. input_bbox: List[int],
  1770. match_bbox: List[int],
  1771. width: int,
  1772. disperse: int,
  1773. title_text: List[Tuple[int, List[int]]],
  1774. ) -> float:
  1775. """
  1776. Calculate the vertical distance between two bounding boxes, considering title text adjustments.
  1777. Args:
  1778. input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
  1779. match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
  1780. width (int): The width of the input bounding box used for normalization.
  1781. disperse (int): The dispersion factor used to normalize the vertical distance.
  1782. title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
  1783. Format: [(position_indicator, [x1, y1, x2, y2]), ...].
  1784. Returns:
  1785. float: The calculated vertical distance taking into account the title text adjustments.
  1786. """
  1787. x1, y1, x2, y2 = input_bbox
  1788. x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
  1789. # Determine horizontal distance adjustment based on title text
  1790. if x1 > x2_prime:
  1791. if title_text and title_text[0][0] == 3:
  1792. x1 -= title_text[0][1][2] - title_text[0][1][0]
  1793. horizontal_adjustment = (x1 - x2_prime) * 0.5
  1794. else:
  1795. if title_text and title_text[-1][0] == 4:
  1796. x2 += title_text[-1][1][2] - title_text[-1][1][0]
  1797. horizontal_adjustment = x1_prime - x2
  1798. # Calculate vertical distance with adjustments
  1799. vertical_distance = (
  1800. abs(y2_prime - y1) // disperse
  1801. + horizontal_adjustment // width
  1802. + horizontal_adjustment / 5000
  1803. )
  1804. return vertical_distance
  1805. def _nearest_edge_distance(
  1806. input_bbox: List[int],
  1807. match_bbox: List[int],
  1808. weight: List[float] = [1.0, 1.0, 1.0, 1.0],
  1809. label: str = "text",
  1810. no_mask_labels: List[str] = [],
  1811. min_edge_distance_config: List[float] = [],
  1812. tolerance_len: float = 10.0,
  1813. ) -> Tuple[float, List[float]]:
  1814. """
  1815. Calculate the nearest edge distance between two bounding boxes, considering directional weights.
  1816. Args:
  1817. input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
  1818. match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
  1819. weight (list, optional): Directional weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
  1820. label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
  1821. no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
  1822. min_edge_distance_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
  1823. Defaults to [float('inf'), float('inf')].
  1824. tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.
  1825. Returns:
  1826. Tuple[float, List[float]]: A tuple containing:
  1827. - The calculated minimum edge distance between the bounding boxes.
  1828. - A list with the minimum edge distances in the x and y directions.
  1829. """
  1830. match_bbox_iou = _calculate_overlap_area_div_minbox_area_ratio(
  1831. input_bbox,
  1832. match_bbox,
  1833. )
  1834. if match_bbox_iou > 0 and label not in no_mask_labels:
  1835. return 0, [0, 0]
  1836. if not min_edge_distance_config:
  1837. min_edge_distance_config = [float("inf"), float("inf")]
  1838. min_edge_distance_x, min_edge_distance_y = min_edge_distance_config
  1839. x1, y1, x2, y2 = input_bbox
  1840. x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
  1841. direction_num = 0
  1842. distance_x = float("inf")
  1843. distance_y = float("inf")
  1844. distance = [float("inf")] * 4
  1845. # input_bbox is to the left of match_bbox
  1846. if x2 < x1_prime:
  1847. direction_num += 1
  1848. distance[0] = x1_prime - x2
  1849. if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
  1850. distance_x = min_edge_distance_x * weight[0]
  1851. else:
  1852. distance_x = distance[0] * weight[0]
  1853. # input_bbox is to the right of match_bbox
  1854. elif x1 > x2_prime:
  1855. direction_num += 1
  1856. distance[1] = x1 - x2_prime
  1857. if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
  1858. distance_x = min_edge_distance_x * weight[1]
  1859. else:
  1860. distance_x = distance[1] * weight[1]
  1861. elif match_bbox_iou > 0:
  1862. distance[0] = 0
  1863. distance_x = 0
  1864. # input_bbox is above match_bbox
  1865. if y2 < y1_prime:
  1866. direction_num += 1
  1867. distance[2] = y1_prime - y2
  1868. if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
  1869. distance_y = min_edge_distance_y * weight[2]
  1870. else:
  1871. distance_y = distance[2] * weight[2]
  1872. if label in no_mask_labels:
  1873. distance_y = max(0.1, distance_y) * 10 # for abstract
  1874. # input_bbox is below match_bbox
  1875. elif y1 > y2_prime:
  1876. direction_num += 1
  1877. distance[3] = y1 - y2_prime
  1878. if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
  1879. distance_y = min_edge_distance_y * weight[3]
  1880. else:
  1881. distance_y = distance[3] * weight[3]
  1882. elif match_bbox_iou > 0:
  1883. distance[2] = 0
  1884. distance_y = 0
  1885. if direction_num == 2:
  1886. return (distance_x + distance_y), [
  1887. min(distance[0], distance[1]),
  1888. min(distance[2], distance[3]),
  1889. ]
  1890. else:
  1891. return min(distance_x, distance_y), [
  1892. min(distance[0], distance[1]),
  1893. min(distance[2], distance[3]),
  1894. ]
  1895. def _get_weights(label, horizontal):
  1896. """Define weights based on the label and orientation."""
  1897. if label == "doc_title":
  1898. return (
  1899. [1, 0.1, 0.1, 1] if horizontal else [0.2, 0.1, 1, 1]
  1900. ) # left-down , right-left
  1901. elif label in [
  1902. "paragraph_title",
  1903. "table_title",
  1904. "abstract",
  1905. "image",
  1906. "seal",
  1907. "chart",
  1908. "figure",
  1909. ]:
  1910. return [1, 1, 0.1, 1] # down
  1911. else:
  1912. return [1, 1, 1, 0.1] # up
  1913. def _nearest_iou_edge_distance(
  1914. input_bbox: List[int],
  1915. match_bbox: List[int],
  1916. label: str,
  1917. vision_labels: List[str],
  1918. no_mask_labels: List[str],
  1919. median_width: int = -1,
  1920. title_labels: List[str] = [],
  1921. title_text: List[Tuple[int, List[int]]] = [],
  1922. sub_title: List[List[int]] = [],
  1923. min_distance_config: List[float] = [],
  1924. tolerance_len: float = 10.0,
  1925. ) -> Tuple[float, List[float]]:
  1926. """
  1927. Calculate the nearest IOU edge distance between two bounding boxes, considering label types, title adjustments, and minimum distance configurations.
  1928. This function computes the edge distance between two bounding boxes while considering their overlap (IOU) and various adjustments based on label types,
  1929. title text, and subtitle information. It also applies minimum distance configurations and tolerance adjustments.
  1930. Args:
  1931. input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
  1932. match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
  1933. label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
  1934. vision_labels (List[str]): List of labels for vision-related objects (e.g., images, icons).
  1935. no_mask_labels (List[str]): Labels for which no masking is applied when calculating edge distances.
  1936. median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
  1937. title_labels (List[str], optional): Labels that indicate the object is a title. Defaults to an empty list.
  1938. title_text (List[Tuple[int, List[int]]], optional): Text content associated with title labels, in the format [(position_indicator, [x1, y1, x2, y2]), ...].
  1939. sub_title (List[List[int]], optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
  1940. min_distance_config (List[float], optional): Configuration for minimum distances [min_edge_distance_config, up_edge_distances_config, total_distance].
  1941. tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.0.
  1942. Returns:
  1943. Tuple[float, List[float]]: A tuple containing:
  1944. - The calculated distance considering IOU and adjustments.
  1945. - The updated minimum distance configuration.
  1946. """
  1947. x1, y1, x2, y2 = input_bbox
  1948. x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
  1949. min_edge_distance_config, up_edge_distances_config, total_distance = (
  1950. min_distance_config
  1951. )
  1952. iou_distance = 0
  1953. if label in vision_labels:
  1954. horizontal1 = horizontal2 = True
  1955. else:
  1956. horizontal1 = _get_bbox_direction(input_bbox)
  1957. horizontal2 = _get_bbox_direction(match_bbox, 3)
  1958. if (
  1959. horizontal1 != horizontal2
  1960. or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
  1961. ):
  1962. iou_distance = 1
  1963. if label == "doc_title":
  1964. # Calculate distance for titles
  1965. disperse = max(1, median_width)
  1966. tolerance_len = max(tolerance_len, disperse)
  1967. # Adjust input_bbox based on sub_title
  1968. if sub_title:
  1969. for sub in sub_title:
  1970. x1_, y1_, x2_, y2_ = sub
  1971. x1, y1, x2, y2 = (
  1972. min(x1, x1_),
  1973. min(y1, y1_),
  1974. min(x2, x2_),
  1975. max(y2, y2_),
  1976. )
  1977. input_bbox = [x1, y1, x2, y2]
  1978. if title_text:
  1979. for sub in title_text:
  1980. x1_, y1_, x2_, y2_ = sub[1]
  1981. if horizontal1:
  1982. x1, y1, x2, y2 = (
  1983. min(x1, x1_),
  1984. min(y1, y1_),
  1985. min(x2, x2_),
  1986. max(y2, y2_),
  1987. )
  1988. else:
  1989. x1, y1, x2, y2 = (
  1990. min(x1, x1_),
  1991. min(y1, y1_),
  1992. max(x2, x2_),
  1993. min(y2, y2_),
  1994. )
  1995. input_bbox = [x1, y1, x2, y2]
  1996. # Calculate edge distance
  1997. weight = _get_weights(label, horizontal1)
  1998. if label == "abstract":
  1999. tolerance_len *= 2
  2000. edge_distance, edge_distance_config = _nearest_edge_distance(
  2001. input_bbox,
  2002. match_bbox,
  2003. weight,
  2004. label=label,
  2005. no_mask_labels=no_mask_labels,
  2006. min_edge_distance_config=min_edge_distance_config,
  2007. tolerance_len=tolerance_len,
  2008. )
  2009. # Weights for combining distances
  2010. iou_edge_weight = [10**8, 10**4, 1, 0.0001]
  2011. # Calculate up and left edge distances
  2012. up_edge_distance = y1_prime
  2013. left_edge_distance = x1_prime
  2014. if (
  2015. label in no_mask_labels or label in title_labels or label in vision_labels
  2016. ) and y1 > y2_prime:
  2017. up_edge_distance = -y2_prime
  2018. left_edge_distance = -x2_prime
  2019. min_up_edge_distance = up_edge_distances_config
  2020. if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
  2021. up_edge_distance = min_up_edge_distance
  2022. # Calculate total distance
  2023. distance = (
  2024. iou_distance * iou_edge_weight[0]
  2025. + edge_distance * iou_edge_weight[1]
  2026. + up_edge_distance * iou_edge_weight[2]
  2027. + left_edge_distance * iou_edge_weight[3]
  2028. )
  2029. # Update minimum distance configuration if a smaller distance is found
  2030. if total_distance > distance:
  2031. edge_distance_config = [
  2032. edge_distance_config[0],
  2033. edge_distance_config[1],
  2034. ]
  2035. min_distance_config = [
  2036. edge_distance_config,
  2037. up_edge_distance,
  2038. distance,
  2039. ]
  2040. return distance, min_distance_config
  2041. def get_show_color(label: str) -> Tuple:
  2042. label_colors = {
  2043. # Medium Blue (from 'titles_list')
  2044. "paragraph_title": (102, 102, 255, 100),
  2045. "doc_title": (255, 248, 220, 100), # Cornsilk
  2046. # Light Yellow (from 'tables_caption_list')
  2047. "table_title": (255, 255, 102, 100),
  2048. # Sky Blue (from 'imgs_caption_list')
  2049. "figure_title": (102, 178, 255, 100),
  2050. "chart_title": (221, 160, 221, 100), # Plum
  2051. "vision_footnote": (144, 238, 144, 100), # Light Green
  2052. # Deep Purple (from 'texts_list')
  2053. "text": (153, 0, 76, 100),
  2054. # Bright Green (from 'interequations_list')
  2055. "formula": (0, 255, 0, 100),
  2056. "abstract": (255, 239, 213, 100), # Papaya Whip
  2057. # Medium Green (from 'lists_list' and 'indexs_list')
  2058. "content": (40, 169, 92, 100),
  2059. # Neutral Gray (from 'dropped_bbox_list')
  2060. "seal": (158, 158, 158, 100),
  2061. # Olive Yellow (from 'tables_body_list')
  2062. "table": (204, 204, 0, 100),
  2063. # Bright Green (from 'imgs_body_list')
  2064. "image": (153, 255, 51, 100),
  2065. # Bright Green (from 'imgs_body_list')
  2066. "figure": (153, 255, 51, 100),
  2067. "chart": (216, 191, 216, 100), # Thistle
  2068. # Pale Yellow-Green (from 'tables_footnote_list')
  2069. "reference": (229, 255, 204, 100),
  2070. "algorithm": (255, 250, 240, 100), # Floral White
  2071. }
  2072. default_color = (158, 158, 158, 100)
  2073. return label_colors.get(label, default_color)