preproc_2_parasplit_example.json 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. {
  2. "page_0":{
  3. "para_blocks": [
  4. {
  5. "block_id": 0,
  6. "bbox": [39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082],
  7. "text": "IOP Conference Series: Earth and Environmental Science",
  8. "dir": [1.0, 0.0],
  9. "X0": 39.0,
  10. "X1": 347.1359558105469,
  11. "avg_char_width": 6.4194990793863935,
  12. "avg_char_height": 16.48800277709961,
  13. "block_font_type": "Helvetica",
  14. "block_font_size": 12.0,
  15. "is_segmented": 1,
  16. "paras": [
  17. {
  18. "para_id": 0,
  19. "bbox": [39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082],
  20. "text": "IOP Conference Series: Earth and Environmental Science",
  21. "is_matched": 1,
  22. "is_title": 0,
  23. "font_type": "Helvetica",
  24. "font_size": 12.0,
  25. "font_color": 0,
  26. "neighbor_paras": [null, null]
  27. }
  28. ],
  29. "bboxes_para": [[39.0, 34.719993591308594, 347.1359558105469, 51.2079963684082]]
  30. },
  31. {
  32. "block_id": 1,
  33. "bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
  34. "text": "PAPER • OPEN ACCESS",
  35. "dir": [1.0, 0.0],
  36. "X0": 39.0,
  37. "X1": 143.67001342773438,
  38. "avg_char_width": 6.541875839233398,
  39. "avg_char_height": 12.392997741699219,
  40. "block_font_type": "Helvetica-Bold",
  41. "block_font_size": 9.0,
  42. "is_segmented": 1,
  43. "paras": [
  44. {
  45. "para_id": 0,
  46. "bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
  47. "text": "PAPER • OPEN ACCESS",
  48. "is_matched": 1,
  49. "is_title": 0,
  50. "font_type": "Helvetica-Bold",
  51. "font_size": 9.0,
  52. "font_color": 0,
  53. "neighbor_paras": [null, null]
  54. },
  55. {
  56. "para_id": 1,
  57. "bbox": [39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625],
  58. "text": "PAPER • OPEN ACCESS",
  59. "is_matched": 1,
  60. "is_title": 0,
  61. "font_type": "Helvetica-Bold",
  62. "font_size": 9.0,
  63. "font_color": 0,
  64. "neighbor_paras": [null, null]
  65. }
  66. ],
  67. "bboxes_para": [[39.0, 111.38001251220703, 143.67001342773438, 123.77301025390625]]
  68. }
  69. ],
  70. "preproc_blocks":[ // overlapping, header, footer, vertical, rotated, watermark, image and table blocks have already been removed here
  71. {
  72. "number": 0,
  73. "type": 0,
  74. "bbox": [
  75. 428.93170166015625,
  76. 744.921142578125,
  77. 541.5675048828125,
  78. 757.8131713867188
  79. ],
  80. "lines": [
  81. {
  82. "spans": [
  83. {
  84. "size": 11.0,
  85. "flags": 20,
  86. "font": "UniversNextPro-BoldCond",
  87. "color": 0,
  88. "ascender": 0.9490000009536743,
  89. "descender": -0.22300000488758087,
  90. "text": "3",
  91. "origin": [
  92. 536.37548828125,
  93. 755.3601684570312
  94. ],
  95. "bbox": [
  96. 536.37548828125,
  97. 744.921142578125,
  98. 541.5675048828125,
  99. 757.8131713867188
  100. ]
  101. }
  102. ],
  103. "wmode": 0,
  104. "dir": [
  105. 1.0,
  106. 0.0
  107. ],
  108. "bbox": [
  109. 536.37548828125,
  110. 744.921142578125,
  111. 541.5675048828125,
  112. 757.8131713867188
  113. ]
  114. },
  115. {
  116. "spans": [
  117. {
  118. "size": 8.0,
  119. "flags": 20,
  120. "font": "UniversNextPro-BoldCond",
  121. "color": 0,
  122. "ascender": 0.9490000009536743,
  123. "descender": -0.22300000488758087,
  124. "text": "Spektrum ",
  125. "origin": [
  126. 428.93170166015625,
  127. 755.3601684570312
  128. ],
  129. "bbox": [
  130. 428.93170166015625,
  131. 747.7681884765625,
  132. 458.7516174316406,
  133. 757.1441650390625
  134. ]
  135. },
  136. {
  137. "size": 8.0,
  138. "flags": 4,
  139. "font": "UniversNextPro-Cond",
  140. "color": 0,
  141. "ascender": 0.9359999895095825,
  142. "descender": -0.21400000154972076,
  143. "text": "der Wissenschaft ",
  144. "origin": [
  145. 458.431884765625,
  146. 755.3601684570312
  147. ],
  148. "bbox": [
  149. 458.431884765625,
  150. 747.8721923828125,
  151. 508.0399169921875,
  152. 757.0721435546875
  153. ]
  154. },
  155. {
  156. "size": 8.0,
  157. "flags": 4,
  158. "font": "UniversNextPro-Regular",
  159. "color": 0,
  160. "ascender": 0.9290000200271606,
  161. "descender": -0.22200000286102295,
  162. "text": "7.21",
  163. "origin": [
  164. 510.2349853515625,
  165. 755.3601684570312
  166. ],
  167. "bbox": [
  168. 510.2349853515625,
  169. 747.9281616210938,
  170. 524.5621948242188,
  171. 757.1361694335938
  172. ]
  173. }
  174. ],
  175. "wmode": 0,
  176. "dir": [
  177. 1.0,
  178. 0.0
  179. ],
  180. "bbox": [
  181. 428.93170166015625,
  182. 747.7681884765625,
  183. 524.5621948242188,
  184. 757.1441650390625
  185. ]
  186. }
  187. ]
  188. }
  189. ],
  190. "images":[
  191. {
  192. "bbox":[0,0,1,1],
  193. "image_path":"path/to/image.jpg"
  194. },
  195. {
  196. "bbox":[1,2,3,4],
  197. "image_path":"path/to/image.jpg"
  198. }
  199. ],
  200. "tables":[
  201. {
  202. "bbox":[0,0,1,1],
  203. "image_path":"path/to/image.jpg"
  204. },
  205. {
  206. "bbox":[1,2,3,4],
  207. "image_path":"path/to/image.jpg"
  208. }
  209. ],
  210. "interline_equations":[
  211. {
  212. "bbox":[0,0,1,1],
  213. "image_path":"path/to/equation.jpg"
  214. },
  215. {
  216. "bbox":[1,2,3,4],
  217. "image_path":"path/to/equation.jpg"
  218. }
  219. ],
  220. "inline_equations":[
  221. {
  222. "bbox":[0,0,1,1],
  223. "image_path":"path/to/equation.jpg"
  224. },
  225. {
  226. "bbox":[1,2,3,4],
  227. "image_path":"path/to/equation.jpg"
  228. }
  229. ],
  230. "layout_bboxes":[
  231. {
  232. "layout_bbox": [0,0, 1,1],
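  233. "layout_label":"V|H|B" // unprocessed | vertical | horizontal | BAD_LAYOUT (NOTE: four meanings are listed for three codes; confirm which code, if any, denotes "unprocessed")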
  234. },
  235. {
  236. "layout_bbox": [1,2,3,4],
  237. "layout_label":"V|H|B"
  238. }
  239. ],
  240. "pymu_raw_blocks":[], // unpruned PyMuPDF blocks, including text, images, etc.
  241. "global_statistic":{ // global statistics
  242. },
  243. "droped_text_block":[ // text that was discarded
  244. ],
  245. "droped_image_block":[
  246. ],
  247. "droped_table_block":[
  248. ],
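  249. "image_backup":[ // images temporarily excluded from processing (e.g. images stacked on top of each other); kept here for now and placed after the page's opening paragraph during final assembly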
  250. ],
  251. "table_backup":[ // same as image_backup, but for tables
  252. ]
  253. },
  254. "page_1":{
  255. }
  256. }