Selaa lähdekoodia

feat: add static images

xu rui 1 vuosi sitten
vanhempi
commit
c98df758e2

+ 416 - 0
docs/en/user_guide/tutorial/output_file_description.rst

@@ -0,0 +1,416 @@
+
+Output File Description
+=========================
+
+After executing the ``magic-pdf`` command, in addition to outputting
+files related to markdown, several other files unrelated to markdown
+will also be generated. These files will be introduced one by one.
+
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+
+Each page layout consists of one or more boxes. The number at the top
+left of each box indicates its sequence number. Additionally, in
+``layout.pdf``, different content blocks are highlighted with different
+background colors.
+
+.. figure:: ../../_static/image/layout_example.png
+   :alt: layout example
+
+   layout example
+
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+
+All spans on the page are drawn with different colored line frames
+according to the span type. This file can be used for quality control,
+allowing for quick identification of issues such as missing text or
+unrecognized inline formulas.
+
+.. figure:: ../../_static/image/spans_example.png
+   :alt: spans example
+
+   spans example
+
+some_pdf_model.json
+~~~~~~~~~~~~~~~~~~~
+
+Structure Definition
+^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+   from pydantic import BaseModel, Field
+   from enum import IntEnum
+
+   class CategoryType(IntEnum):
+        title = 0               # Title
+        plain_text = 1          # Text
+        abandon = 2             # Includes headers, footers, page numbers, and page annotations
+        figure = 3              # Image
+        figure_caption = 4      # Image description
+        table = 5               # Table
+        table_caption = 6       # Table description
+        table_footnote = 7      # Table footnote
+        isolate_formula = 8     # Block formula
+        formula_caption = 9     # Formula label
+
+        embedding = 13          # Inline formula
+        isolated = 14           # Block formula
+        text = 15               # OCR recognition result
+
+
+   class PageInfo(BaseModel):
+       page_no: int = Field(description="Page number, the first page is 0", ge=0)
+       height: int = Field(description="Page height", gt=0)
+       width: int = Field(description="Page width", ge=0)
+
+   class ObjectInferenceResult(BaseModel):
+       category_id: CategoryType = Field(description="Category", ge=0)
+       poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
+       score: float = Field(description="Confidence of the inference result")
+       latex: str | None = Field(description="LaTeX parsing result", default=None)
+       html: str | None = Field(description="HTML parsing result", default=None)
+
+   class PageInferenceResults(BaseModel):
+        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
+        page_info: PageInfo = Field(description="Page metadata")
+
+
+   # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
+   inference_result: list[PageInferenceResults] = []
+
+The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
+representing the coordinates of the top-left, top-right, bottom-right,
+and bottom-left points respectively. |Poly Coordinate Diagram|
+
+Example
+^^^^^^^
+
+.. code:: json
+
+   [
+       {
+           "layout_dets": [
+               {
+                   "category_id": 2,
+                   "poly": [
+                       99.1906967163086,
+                       100.3119125366211,
+                       730.3707885742188,
+                       100.3119125366211,
+                       730.3707885742188,
+                       245.81326293945312,
+                       99.1906967163086,
+                       245.81326293945312
+                   ],
+                   "score": 0.9999997615814209
+               }
+           ],
+           "page_info": {
+               "page_no": 0,
+               "height": 2339,
+               "width": 1654
+           }
+       },
+       {
+           "layout_dets": [
+               {
+                   "category_id": 5,
+                   "poly": [
+                       99.13092803955078,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2264.78076171875,
+                       99.13092803955078,
+                       2264.78076171875
+                   ],
+                   "score": 0.9999997019767761
+               }
+           ],
+           "page_info": {
+               "page_no": 1,
+               "height": 2339,
+               "width": 1654
+           }
+       }
+   ]
+
+some_pdf_middle.json
+~~~~~~~~~~~~~~~~~~~~
+
++----------------+--------------------------------------------------------------+
+| Field Name     | Description                                                  |
++================+==============================================================+
+| pdf_info       | list, each element is a dict representing the parsing result |
+|                | of each PDF page, see the table below for details            |
++----------------+--------------------------------------------------------------+
+| \_parse_type   | ocr \| txt, used to indicate the mode used in this           |
+|                | intermediate parsing state                                   |
++----------------+--------------------------------------------------------------+
+| \_version_name | string, indicates the version of magic-pdf used in this      |
+|                | parsing                                                      |
++----------------+--------------------------------------------------------------+
+
+**pdf_info**
+
+Field structure description
+
++---------------------+------------------------------------------------------------+
+| Field Name          | Description                                                |
++=====================+============================================================+
+| preproc_blocks      | Intermediate result after PDF preprocessing, not yet       |
+|                     | segmented                                                  |
++---------------------+------------------------------------------------------------+
+| layout_bboxes       | Layout segmentation results, containing layout direction   |
+|                     | (vertical, horizontal), and bbox, sorted by reading order  |
++---------------------+------------------------------------------------------------+
+| page_idx            | Page number, starting from 0                               |
++---------------------+------------------------------------------------------------+
+| page_size           | Page width and height                                      |
++---------------------+------------------------------------------------------------+
+| \_layout_tree       | Layout tree structure                                      |
++---------------------+------------------------------------------------------------+
+| images              | list, each element is a dict representing an img_block     |
++---------------------+------------------------------------------------------------+
+| tables              | list, each element is a dict representing a table_block    |
++---------------------+------------------------------------------------------------+
+| interline_equations | list, each element is a dict representing an               |
+|                     | interline_equation_block                                   |
++---------------------+------------------------------------------------------------+
+| discarded_blocks    | List, block information returned by the model that needs   |
+|                     | to be dropped                                              |
++---------------------+------------------------------------------------------------+
+| para_blocks         | Result after segmenting preproc_blocks                     |
++---------------------+------------------------------------------------------------+
+
+In the above table, ``para_blocks`` is an array of dicts, each dict
+representing a block structure. A block can support up to one level of
+nesting.
+
+**block**
+
+The outer block is referred to as a first-level block, and the fields in
+the first-level block include:
+
++---------+-------------------------------------------------------------+
+| Field   | Description                                                 |
+| Name    |                                                             |
++=========+=============================================================+
+| type    | Block type (table|image)                                    |
++---------+-------------------------------------------------------------+
+| bbox    | Block bounding box coordinates                              |
++---------+-------------------------------------------------------------+
+| blocks  | list, each element is a dict representing a second-level    |
+|         | block                                                       |
++---------+-------------------------------------------------------------+
+
+There are only two types of first-level blocks: “table” and “image”. All
+other blocks are second-level blocks.
+
+The fields in a second-level block include:
+
++------------+----------------------------------------------------------------+
+| Field Name | Description                                                    |
++============+================================================================+
+| type       | Block type                                                     |
++------------+----------------------------------------------------------------+
+| bbox       | Block bounding box coordinates                                 |
++------------+----------------------------------------------------------------+
+| lines      | list, each element is a dict representing a line, used to      |
+|            | describe the composition of a line of information              |
++------------+----------------------------------------------------------------+
+
+Detailed explanation of second-level block types
+
+================== ======================
+type               Description
+================== ======================
+image_body         Main body of the image
+image_caption      Image description text
+table_body         Main body of the table
+table_caption      Table description text
+table_footnote     Table footnote
+text               Text block
+title              Title block
+interline_equation Block formula
+================== ======================
+
+**line**
+
+The field format of a line is as follows:
+
++------------+----------------------------------------------------------------+
+| Field Name | Description                                                    |
++============+================================================================+
+| bbox       | Bounding box coordinates of the line                           |
++------------+----------------------------------------------------------------+
+| spans      | list, each element is a dict representing a span, used to      |
+|            | describe the composition of the smallest unit                  |
++------------+----------------------------------------------------------------+
+
+**span**
+
++----------+-----------------------------------------------------------+
+| Field    | Description                                               |
+| Name     |                                                           |
++==========+===========================================================+
+| bbox     | Bounding box coordinates of the span                      |
++----------+-----------------------------------------------------------+
+| type     | Type of the span                                          |
++----------+-----------------------------------------------------------+
+| content  | Text spans use content, chart spans use img_path to store |
+| \|       | the actual text or screenshot path information            |
+| img_path |                                                           |
++----------+-----------------------------------------------------------+
+
+The types of spans are as follows:
+
+================== ==============
+type               Description
+================== ==============
+image              Image
+table              Table
+text               Text
+inline_equation    Inline formula
+interline_equation Block formula
+================== ==============
+
+**Summary**
+
+A span is the smallest storage unit for all elements.
+
+The elements stored within para_blocks are block information.
+
+The block structure is as follows:
+
+First-level block (if any) -> Second-level block -> Line -> Span
+
+.. _example-1:
+
+Example
+^^^^^^^
+
+.. code:: json
+
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+
+.. |Poly Coordinate Diagram| image:: ../../_static/image/poly.png

BIN
next_docs/en/_static/image/MinerU-logo-hq.png


BIN
next_docs/en/_static/image/MinerU-logo.png


BIN
next_docs/en/_static/image/datalab_logo.png


BIN
next_docs/en/_static/image/flowchart_en.png


BIN
next_docs/en/_static/image/flowchart_zh_cn.png


BIN
next_docs/en/_static/image/layout_example.png


BIN
next_docs/en/_static/image/poly.png


BIN
next_docs/en/_static/image/project_panorama_en.png


BIN
next_docs/en/_static/image/project_panorama_zh_cn.png


BIN
next_docs/en/_static/image/spans_example.png


BIN
next_docs/en/_static/image/web_demo_1.png


+ 8 - 1
next_docs/en/additional_notes/changelog.rst

@@ -16,4 +16,11 @@ Changelog
    process, added table recognition functionality
 -  2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
    issues and installation documentation
--  2024/07/05: Initial open-source release
+-  2024/07/05: Initial open-source release
+
+
+.. warning::
+
+   fix ``localized deployment version`` and ``front-end interface``
+
+

+ 2 - 0
next_docs/en/additional_notes/glossary.rst

@@ -6,4 +6,6 @@ Glossary
 1. jsonl 
     TODO: add description
 
+2. magic-pdf.json
+    TODO: add description
 

+ 1 - 1
next_docs/en/user_guide/quick_start.rst

@@ -9,5 +9,5 @@ Eager to get started? This page gives a good introduction to MinerU. Follow Inst
     :maxdepth: 1
 
     quick_start/command_line
-    quick_start/extract_text
+    quick_start/to_markdown
 

+ 1 - 2
next_docs/en/user_guide/quick_start/command_line.rst

@@ -55,6 +55,5 @@ directory. The output file list is as follows:
    ├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
    └── some_pdf_content_list.json           # Rich text JSON arranged in reading order
 
-For more information about the output files, please refer to the `Output
-File Description <docs/output_file_en_us.md>`__.
+For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`.
 

+ 52 - 0
next_docs/en/user_guide/quick_start/to_markdown.rst

@@ -0,0 +1,52 @@
+
+
+Convert To Markdown
+========================
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
+    from magic_pdf.pipe.OCRPipe import OCRPipe
+
+
+    ## args
+    model_list = []
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+
+
+    ## prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    ) # create 00
+    image_dir = str(os.path.basename(local_image_dir))
+
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)   # read the pdf content
+
+
+    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+
+    pipe.pipe_classify()
+    pipe.pipe_analyze()
+    pipe.pipe_parse()
+
+    pdf_info = pipe.pdf_mid_data["pdf_info"]
+
+
+    md_content = pipe.pipe_mk_markdown(
+        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
+    )
+
+    if isinstance(md_content, list):
+        md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
+    else:
+        md_writer.write_string(f"{pdf_file_name}.md", md_content)
+
+
+Check :doc:`../data/data_reader_writer` for more [reader | writer] examples 

+ 6 - 1
next_docs/en/user_guide/tutorial.rst

@@ -1,5 +1,10 @@
 
 Tutorial
-----------
+===========
 
 From beginning to end, this tutorial shows how to use MinerU via a minimal project.
+
+.. toctree::
+    :maxdepth: 1
+
+    tutorial/output_file_description