mcol_sort.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. """
  2. This is an advanced PyMuPDF utility for detecting multi-column pages.
  3. It can be used in a shell script, or its main function can be imported and
  4. invoked as described below.
  5. Features
  6. ---------
  7. - Identify text belonging to (a variable number of) columns on the page.
  8. - Text with different background color is handled separately, allowing for
  9. easier treatment of side remarks, comment boxes, etc.
  10. - Uses text block detection capability to identify text blocks and
  11. uses the block bboxes as primary structuring principle.
  12. - Supports ignoring footers via a footer margin parameter.
  13. - Returns re-created text boundary boxes (integer coordinates), sorted ascending
  14. by the top, then by the left coordinates.
  15. Restrictions
  16. -------------
  17. - Only supporting horizontal, left-to-right text
  18. - Returns a list of text boundary boxes - not the text itself. The caller is
  19. expected to extract text from within the returned boxes.
  20. - Text written above images is ignored altogether (option).
  21. - This utility works as expected in most cases. The following situation cannot
  22. be handled correctly:
  23. * overlapping (non-disjoint) text blocks
  24. * image captions are not recognized and are handled like normal text
  25. Usage
  26. ------
  27. - As a CLI shell command use
  28. python multi_column.py input.pdf footer_margin header_margin
  29. Where footer margin and header margin are the heights of the bottom and top stripes to ignore on each page (both optional, default 50).
  30. This code is intended to be modified according to your need.
  31. - Use in a Python script as follows:
  32. ----------------------------------------------------------------------------------
  33. from multi_column import column_boxes
  34. # for each page execute
  35. bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
  36. # bboxes is a list of fitz.IRect objects, that are sorted ascending by their y0,
  37. # then x0 coordinates. Their text content can be extracted by all PyMuPDF
  38. # get_text() variants, like for instance the following:
  39. for rect in bboxes:
  40. print(page.get_text(clip=rect, sort=True))
  41. ----------------------------------------------------------------------------------
  42. """
  43. import sys
  44. from magic_pdf.libs.commons import fitz
  45. def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
  46. """Determine bboxes which wrap a column."""
  47. paths = page.get_drawings()
  48. bboxes = []
  49. # path rectangles
  50. path_rects = []
  51. # image bboxes
  52. img_bboxes = []
  53. # bboxes of non-horizontal text
  54. # avoid when expanding horizontal text boxes
  55. vert_bboxes = []
  56. # compute relevant page area
  57. clip = +page.rect
  58. clip.y1 -= footer_margin # Remove footer area
  59. clip.y0 += header_margin # Remove header area
  60. def can_extend(temp, bb, bboxlist):
  61. """Determines whether rectangle 'temp' can be extended by 'bb'
  62. without intersecting any of the rectangles contained in 'bboxlist'.
  63. Items of bboxlist may be None if they have been removed.
  64. Returns:
  65. True if 'temp' has no intersections with items of 'bboxlist'.
  66. """
  67. for b in bboxlist:
  68. if not intersects_bboxes(temp, vert_bboxes) and (
  69. b == None or b == bb or (temp & b).is_empty
  70. ):
  71. continue
  72. return False
  73. return True
  74. def in_bbox(bb, bboxes):
  75. """Return 1-based number if a bbox contains bb, else return 0."""
  76. for i, bbox in enumerate(bboxes):
  77. if bb in bbox:
  78. return i + 1
  79. return 0
  80. def intersects_bboxes(bb, bboxes):
  81. """Return True if a bbox intersects bb, else return False."""
  82. for bbox in bboxes:
  83. if not (bb & bbox).is_empty:
  84. return True
  85. return False
  86. def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
  87. """Extend a bbox to the right page border.
  88. Whenever there is no text to the right of a bbox, enlarge it up
  89. to the right page border.
  90. Args:
  91. bboxes: (list[IRect]) bboxes to check
  92. width: (int) page width
  93. path_bboxes: (list[IRect]) bboxes with a background color
  94. vert_bboxes: (list[IRect]) bboxes with vertical text
  95. img_bboxes: (list[IRect]) bboxes of images
  96. Returns:
  97. Potentially modified bboxes.
  98. """
  99. for i, bb in enumerate(bboxes):
  100. # do not extend text with background color
  101. if in_bbox(bb, path_bboxes):
  102. continue
  103. # do not extend text in images
  104. if in_bbox(bb, img_bboxes):
  105. continue
  106. # temp extends bb to the right page border
  107. temp = +bb
  108. temp.x1 = width
  109. # do not cut through colored background or images
  110. if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
  111. continue
  112. # also, do not intersect other text bboxes
  113. check = can_extend(temp, bb, bboxes)
  114. if check:
  115. bboxes[i] = temp # replace with enlarged bbox
  116. return [b for b in bboxes if b != None]
  117. def clean_nblocks(nblocks):
  118. """Do some elementary cleaning."""
  119. # 1. remove any duplicate blocks.
  120. blen = len(nblocks)
  121. if blen < 2:
  122. return nblocks
  123. start = blen - 1
  124. for i in range(start, -1, -1):
  125. bb1 = nblocks[i]
  126. bb0 = nblocks[i - 1]
  127. if bb0 == bb1:
  128. del nblocks[i]
  129. # 2. repair sequence in special cases:
  130. # consecutive bboxes with almost same bottom value are sorted ascending
  131. # by x-coordinate.
  132. y1 = nblocks[0].y1 # first bottom coordinate
  133. i0 = 0 # its index
  134. i1 = -1 # index of last bbox with same bottom
  135. # Iterate over bboxes, identifying segments with approx. same bottom value.
  136. # Replace every segment by its sorted version.
  137. for i in range(1, len(nblocks)):
  138. b1 = nblocks[i]
  139. if abs(b1.y1 - y1) > 10: # different bottom
  140. if i1 > i0: # segment length > 1? Sort it!
  141. nblocks[i0 : i1 + 1] = sorted(
  142. nblocks[i0 : i1 + 1], key=lambda b: b.x0
  143. )
  144. y1 = b1.y1 # store new bottom value
  145. i0 = i # store its start index
  146. i1 = i # store current index
  147. if i1 > i0: # segment waiting to be sorted
  148. nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
  149. return nblocks
  150. # extract vector graphics
  151. for p in paths:
  152. path_rects.append(p["rect"].irect)
  153. path_bboxes = path_rects
  154. # sort path bboxes by ascending top, then left coordinates
  155. path_bboxes.sort(key=lambda b: (b.y0, b.x0))
  156. # bboxes of images on page, no need to sort them
  157. for item in page.get_images():
  158. img_bboxes.extend(page.get_image_rects(item[0]))
  159. # blocks of text on page
  160. blocks = page.get_text(
  161. "dict",
  162. flags=fitz.TEXTFLAGS_TEXT,
  163. clip=clip,
  164. )["blocks"]
  165. # Make block rectangles, ignoring non-horizontal text
  166. for b in blocks:
  167. bbox = fitz.IRect(b["bbox"]) # bbox of the block
  168. # ignore text written upon images
  169. if no_image_text and in_bbox(bbox, img_bboxes):
  170. continue
  171. # confirm first line to be horizontal
  172. line0 = b["lines"][0] # get first line
  173. if line0["dir"] != (1, 0): # only accept horizontal text
  174. vert_bboxes.append(bbox)
  175. continue
  176. srect = fitz.EMPTY_IRECT()
  177. for line in b["lines"]:
  178. lbbox = fitz.IRect(line["bbox"])
  179. text = "".join([s["text"].strip() for s in line["spans"]])
  180. if len(text) > 1:
  181. srect |= lbbox
  182. bbox = +srect
  183. if not bbox.is_empty:
  184. bboxes.append(bbox)
  185. # Sort text bboxes by ascending background, top, then left coordinates
  186. bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
  187. # Extend bboxes to the right where possible
  188. bboxes = extend_right(
  189. bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
  190. )
  191. # immediately return of no text found
  192. if bboxes == []:
  193. return []
  194. # --------------------------------------------------------------------
  195. # Join bboxes to establish some column structure
  196. # --------------------------------------------------------------------
  197. # the final block bboxes on page
  198. nblocks = [bboxes[0]] # pre-fill with first bbox
  199. bboxes = bboxes[1:] # remaining old bboxes
  200. for i, bb in enumerate(bboxes): # iterate old bboxes
  201. check = False # indicates unwanted joins
  202. # check if bb can extend one of the new blocks
  203. for j in range(len(nblocks)):
  204. nbb = nblocks[j] # a new block
  205. # never join across columns
  206. if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
  207. continue
  208. # never join across different background colors
  209. if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
  210. continue
  211. temp = bb | nbb # temporary extension of new block
  212. check = can_extend(temp, nbb, nblocks)
  213. if check == True:
  214. break
  215. if not check: # bb cannot be used to extend any of the new bboxes
  216. nblocks.append(bb) # so add it to the list
  217. j = len(nblocks) - 1 # index of it
  218. temp = nblocks[j] # new bbox added
  219. # check if some remaining bbox is contained in temp
  220. check = can_extend(temp, bb, bboxes)
  221. if check == False:
  222. nblocks.append(bb)
  223. else:
  224. nblocks[j] = temp
  225. bboxes[i] = None
  226. # do some elementary cleaning
  227. nblocks = clean_nblocks(nblocks)
  228. # return identified text bboxes
  229. return nblocks
  230. if __name__ == "__main__":
  231. """Only for debugging purposes, currently.
  232. Draw red borders around the returned text bboxes and insert
  233. the bbox number.
  234. Then save the file under the name "input-blocks.pdf".
  235. """
  236. # get the file name
  237. filename = sys.argv[1]
  238. # check if footer margin is given
  239. if len(sys.argv) > 2:
  240. footer_margin = int(sys.argv[2])
  241. else: # use default vaue
  242. footer_margin = 50
  243. # check if header margin is given
  244. if len(sys.argv) > 3:
  245. header_margin = int(sys.argv[3])
  246. else: # use default vaue
  247. header_margin = 50
  248. # open document
  249. doc = fitz.open(filename)
  250. # iterate over the pages
  251. for page in doc:
  252. # remove any geometry issues
  253. page.wrap_contents()
  254. # get the text bboxes
  255. bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
  256. # prepare a canvas to draw rectangles and text
  257. shape = page.new_shape()
  258. # iterate over the bboxes
  259. for i, rect in enumerate(bboxes):
  260. shape.draw_rect(rect) # draw a border
  261. # write sequence number
  262. shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
  263. # finish drawing / text with color red
  264. shape.finish(color=fitz.pdfcolor["red"])
  265. shape.commit() # store to the page
  266. # save document with text bboxes
  267. doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))