output_file_description.rst 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. 输出文件格式介绍
  2. ===============
  3. ``magic-pdf`` 命令执行后除了输出和 markdown
  4. 有关的文件以外,还会生成若干个和 markdown
  5. 无关的文件。现在将一一介绍这些文件。
  6. some_pdf_layout.pdf
  7. ~~~~~~~~~~~~~~~~~~~
  8. 每一页的 layout 均由一个或多个框组成。
  9. 每个框左上角的数字表明它们的序号。此外 layout.pdf
  10. 框内用不同的背景色块圈定不同的内容块。
  11. .. figure:: ../../_static/image/layout_example.png
  12. :alt: layout 页面示例
  13. layout 页面示例
  14. some_pdf_spans.pdf
  15. ~~~~~~~~~~~~~~~~~~
  16. 根据 span 类型的不同,采用不同颜色线框绘制页面上所有
  17. span。该文件可以用于质检,可以快速排查出文本丢失、行间公式未识别等问题。
  18. .. figure:: ../../_static/image/spans_example.png
  19. :alt: span 页面示例
  20. span 页面示例
  21. some_pdf_model.json
  22. ~~~~~~~~~~~~~~~~~~~
  23. 结构定义
  24. ^^^^^^^^
  25. .. code:: python
  26. from pydantic import BaseModel, Field
  27. from enum import IntEnum
  28. class CategoryType(IntEnum):
  29. title = 0 # 标题
  30. plain_text = 1 # 文本
  31. abandon = 2 # 包括页眉页脚页码和页面注释
  32. figure = 3 # 图片
  33. figure_caption = 4 # 图片描述
  34. table = 5 # 表格
  35. table_caption = 6 # 表格描述
  36. table_footnote = 7 # 表格注释
  37. isolate_formula = 8 # 行间公式
  38. formula_caption = 9 # 行间公式的标号
  39. embedding = 13 # 行内公式
  40. isolated = 14 # 行间公式
  41. text = 15 # ocr 识别结果
  42. class PageInfo(BaseModel):
  43. page_no: int = Field(description="页码序号,第一页的序号是 0", ge=0)
  44. height: int = Field(description="页面高度", gt=0)
  45. width: int = Field(description="页面宽度", ge=0)
  46. class ObjectInferenceResult(BaseModel):
  47. category_id: CategoryType = Field(description="类别", ge=0)
  48. poly: list[float] = Field(description="四边形坐标, 分别是 左上,右上,右下,左下 四点的坐标")
  49. score: float = Field(description="推理结果的置信度")
  50. latex: str | None = Field(description="latex 解析结果", default=None)
  51. html: str | None = Field(description="html 解析结果", default=None)
  52. class PageInferenceResults(BaseModel):
  53. layout_dets: list[ObjectInferenceResult] = Field(description="页面识别结果", ge=0)
  54. page_info: PageInfo = Field(description="页面元信息")
  55. # 所有页面的推理结果按照页码顺序依次放到列表中即为 MinerU 推理结果
  56. inference_result: list[PageInferenceResults] = []
  57. poly 坐标的格式 [x0, y0, x1, y1, x2, y2, x3, y3],
  58. 分别表示左上、右上、右下、左下四点的坐标 |poly 坐标示意图|
  59. 示例数据
  60. ^^^^^^^^
  61. .. code:: json
  62. [
  63. {
  64. "layout_dets": [
  65. {
  66. "category_id": 2,
  67. "poly": [
  68. 99.1906967163086,
  69. 100.3119125366211,
  70. 730.3707885742188,
  71. 100.3119125366211,
  72. 730.3707885742188,
  73. 245.81326293945312,
  74. 99.1906967163086,
  75. 245.81326293945312
  76. ],
  77. "score": 0.9999997615814209
  78. }
  79. ],
  80. "page_info": {
  81. "page_no": 0,
  82. "height": 2339,
  83. "width": 1654
  84. }
  85. },
  86. {
  87. "layout_dets": [
  88. {
  89. "category_id": 5,
  90. "poly": [
  91. 99.13092803955078,
  92. 2210.680419921875,
  93. 497.3183898925781,
  94. 2210.680419921875,
  95. 497.3183898925781,
  96. 2264.78076171875,
  97. 99.13092803955078,
  98. 2264.78076171875
  99. ],
  100. "score": 0.9999997019767761
  101. }
  102. ],
  103. "page_info": {
  104. "page_no": 1,
  105. "height": 2339,
  106. "width": 1654
  107. }
  108. }
  109. ]
  110. some_pdf_middle.json
  111. ~~~~~~~~~~~~~~~~~~~~
  112. +-----------+----------------------------------------------------------+
  113. | 字段名 | 解释 |
  114. +===========+==========================================================+
  115. | pdf_info | list,每个 |
  116. | | 元素都是一个dict,这个dict是每一页pdf的解析结果,详见下表 |
  117. +-----------+----------------------------------------------------------+
  118. | \_p | ocr \| txt,用来标识本次解析的中间态使用的模式 |
  119. | arse_type | |
  120. +-----------+----------------------------------------------------------+
  121. | \_ver | string, 表示本次解析使用的 magic-pdf 的版本号 |
  122. | sion_name | |
  123. +-----------+----------------------------------------------------------+
  124. **pdf_info** 字段结构说明
  125. +--------------+-------------------------------------------------------+
  126. | 字段名 | 解释 |
  127. +==============+=======================================================+
  128. | pr | pdf预处理后,未分段的中间结果 |
  129. | eproc_blocks | |
  130. +--------------+-------------------------------------------------------+
  131. | l | 布局分割的结果, |
  132. | ayout_bboxes | 含有布局的方向(垂直、水平),和bbox,按阅读顺序排序 |
  133. +--------------+-------------------------------------------------------+
  134. | page_idx | 页码,从0开始 |
  135. +--------------+-------------------------------------------------------+
  136. | page_size | 页面的宽度和高度 |
  137. +--------------+-------------------------------------------------------+
  138. | \ | 布局树状结构 |
  139. | _layout_tree | |
  140. +--------------+-------------------------------------------------------+
  141. | images | list,每个元素是一个dict,每个dict表示一个img_block |
  142. +--------------+-------------------------------------------------------+
  143. | tables | list,每个元素是一个dict,每个dict表示一个table_block |
  144. +--------------+-------------------------------------------------------+
  145. | interli | list,每个元素 |
  146. | ne_equations | 是一个dict,每个dict表示一个interline_equation_block |
  147. +--------------+-------------------------------------------------------+
  148. | disc | List, 模型返回的需要drop的block信息 |
  149. | arded_blocks | |
  150. +--------------+-------------------------------------------------------+
  151. | para_blocks | 将preproc_blocks进行分段之后的结果 |
  152. +--------------+-------------------------------------------------------+
  153. 上表中 ``para_blocks``
  154. 是个dict的数组,每个dict是一个block结构,block最多支持一次嵌套
  155. **block**
  156. 外层block被称为一级block,一级block中的字段包括
  157. ====== ===============================================
  158. 字段名 解释
  159. ====== ===============================================
  160. type block类型(table|image)
  161. bbox block矩形框坐标
  162. blocks list,里面的每个元素都是一个dict格式的二级block
  163. ====== ===============================================
  164. 一级block只有“table”和“image”两种类型,其余block均为二级block
  165. 二级block中的字段包括
  166. +-----+----------------------------------------------------------------+
  167. | 字 | 解释 |
  168. | 段 | |
  169. | 名 | |
  170. +=====+================================================================+
  171. | t | block类型 |
  172. | ype | |
  173. +-----+----------------------------------------------------------------+
  174. | b | block矩形框坐标 |
  175. | box | |
  176. +-----+----------------------------------------------------------------+
  177. | li | list,每个元素都是一个dict表示的line,用来描述一行信息的构成 |
  178. | nes | |
  179. +-----+----------------------------------------------------------------+
  180. 二级block的类型详解
  181. ================== ==============
  182. type desc
  183. ================== ==============
  184. image_body 图像的本体
  185. image_caption 图像的描述文本
  186. image_footnote 图像的脚注
  187. table_body 表格本体
  188. table_caption 表格的描述文本
  189. table_footnote 表格的脚注
  190. text 文本块
  191. title 标题块
  192. index 目录块
  193. list 列表块
  194. interline_equation 行间公式块
  195. ================== ==============
  196. **line**
  197. line 的字段格式如下
  198. +----+-----------------------------------------------------------------+
  199. | 字 | 解释 |
  200. | 段 | |
  201. | 名 | |
  202. +====+=================================================================+
  203. | bb | line的矩形框坐标 |
  204. | ox | |
  205. +----+-----------------------------------------------------------------+
  206. | s | list, |
  207. | pa | 每个元素都是一个dict表示的span,用来描述一个最小组成单元的构成 |
  208. | ns | |
  209. +----+-----------------------------------------------------------------+
  210. **span**
  211. +------------+---------------------------------------------------------+
  212. | 字段名 | 解释 |
  213. +============+=========================================================+
  214. | bbox | span的矩形框坐标 |
  215. +------------+---------------------------------------------------------+
  216. | type | span的类型 |
  217. +------------+---------------------------------------------------------+
  218. | content \| | 文本类型的span使用content,图表类使用img_path |
  219. | img_path | 用来存储实际的文本或者截图路径信息 |
  220. +------------+---------------------------------------------------------+
  221. span 的类型有如下几种
  222. ================== ========
  223. type desc
  224. ================== ========
  225. image 图片
  226. table 表格
  227. text 文本
  228. inline_equation 行内公式
  229. interline_equation 行间公式
  230. ================== ========
  231. **总结**
  232. span是所有元素的最小存储单元
  233. para_blocks内存储的元素为区块信息
  234. 区块结构为
  235. 一级block(如有)->二级block->line->span
  236. .. _示例数据-1:
  237. 示例数据
  238. ^^^^^^^^
  239. .. code:: json
  240. {
  241. "pdf_info": [
  242. {
  243. "preproc_blocks": [
  244. {
  245. "type": "text",
  246. "bbox": [
  247. 52,
  248. 61.956024169921875,
  249. 294,
  250. 82.99800872802734
  251. ],
  252. "lines": [
  253. {
  254. "bbox": [
  255. 52,
  256. 61.956024169921875,
  257. 294,
  258. 72.0000228881836
  259. ],
  260. "spans": [
  261. {
  262. "bbox": [
  263. 54.0,
  264. 61.956024169921875,
  265. 296.2261657714844,
  266. 72.0000228881836
  267. ],
  268. "content": "dependent on the service headway and the reliability of the departure ",
  269. "type": "text",
  270. "score": 1.0
  271. }
  272. ]
  273. }
  274. ]
  275. }
  276. ],
  277. "layout_bboxes": [
  278. {
  279. "layout_bbox": [
  280. 52,
  281. 61,
  282. 294,
  283. 731
  284. ],
  285. "layout_label": "V",
  286. "sub_layout": []
  287. }
  288. ],
  289. "page_idx": 0,
  290. "page_size": [
  291. 612.0,
  292. 792.0
  293. ],
  294. "_layout_tree": [],
  295. "images": [],
  296. "tables": [],
  297. "interline_equations": [],
  298. "discarded_blocks": [],
  299. "para_blocks": [
  300. {
  301. "type": "text",
  302. "bbox": [
  303. 52,
  304. 61.956024169921875,
  305. 294,
  306. 82.99800872802734
  307. ],
  308. "lines": [
  309. {
  310. "bbox": [
  311. 52,
  312. 61.956024169921875,
  313. 294,
  314. 72.0000228881836
  315. ],
  316. "spans": [
  317. {
  318. "bbox": [
  319. 54.0,
  320. 61.956024169921875,
  321. 296.2261657714844,
  322. 72.0000228881836
  323. ],
  324. "content": "dependent on the service headway and the reliability of the departure ",
  325. "type": "text",
  326. "score": 1.0
  327. }
  328. ]
  329. }
  330. ]
  331. }
  332. ]
  333. }
  334. ],
  335. "_parse_type": "txt",
  336. "_version_name": "0.6.1"
  337. }
  338. .. |poly 坐标示意图| image:: ../../_static/image/poly.png