
test: Update test_e2e.py

Sidney233, 4 months ago
parent commit 6be1453c22
4 changed files with 276 additions and 312 deletions
  1. mkdocs.yml (+3 −0)
  2. pyproject.toml (+7 −3)
  3. tests/unittest/pdfs/test.pdf (BIN)
  4. tests/unittest/test_e2e.py (+266 −309)

+ 3 - 0
mkdocs.yml

@@ -78,6 +78,9 @@ plugins:
   - search
   - i18n:
       docs_structure: folder
+      fallback_to_default: true
+      reconfigure_material: true
+      reconfigure_search: true
       languages:
         - locale: en
           default: true
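
Note: these three options belong to the mkdocs-static-i18n plugin. As documented for that plugin, fallback_to_default builds untranslated pages from the default-locale content instead of dropping them, while reconfigure_material and reconfigure_search let the plugin localize the Material theme UI and its search plugin for each locale.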

+ 7 - 3
pyproject.toml

@@ -43,7 +43,8 @@ test = [
     "pytest",
     "pytest-cov",
     "coverage",
-    "beautifulsoup4"
+    "beautifulsoup4",
+    "fuzzywuzzy"
 ]
 vlm = [
     "transformers>=4.51.1",
@@ -150,7 +151,11 @@ omit = [
     "*/cli_parser.py",
     "*/run_async.py"
 ]
+
 [tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.report]
 exclude_also = [
     'def __repr__',
     'if self.debug:',
@@ -162,5 +167,4 @@ exclude_also = [
     'if TYPE_CHECKING:',
     'class .*\bProtocol\):',
     '@(abc\.)?abstractmethod',
-]
-directory = "htmlcov"
+]
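
Two things happen in this pyproject.toml hunk. First, the coverage config is untangled: exclude_also is a [tool.coverage.report] option, so it now lives under its own table, and [tool.coverage.html] keeps only directory = "htmlcov". Second, fuzzywuzzy joins the test extras to power the fuzzy text assertions in the rewritten test. A minimal sketch of the one call the tests rely on (the strings here are illustrative):

    from fuzzywuzzy import fuzz

    # fuzz.ratio returns an integer 0-100 similarity score based on
    # edit distance; 100 means the strings are identical.
    score = fuzz.ratio("Table 1: Table Caption", "Table 1: Table Caption.")
    assert score > 90  # small punctuation/OCR differences still pass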

BIN
tests/unittest/pdfs/test.pdf


+ 266 - 309
tests/unittest/test_e2e.py

@@ -3,17 +3,15 @@ import copy
 import json
 import os
 from pathlib import Path
-
-from cryptography.hazmat.backends.openssl import backend
 from loguru import logger
-
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
 from mineru.cli.common import (
     convert_pdf_bytes_to_bytes_by_pypdfium2,
     prepare_env,
     read_fn,
 )
 from mineru.data.data_reader_writer import FileBasedDataWriter
-from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
 from mineru.utils.enum_class import MakeMode
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
@@ -24,313 +22,272 @@ from mineru.backend.pipeline.model_json_to_middle_json import (
     result_to_middle_json as pipeline_result_to_middle_json,
 )
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
-
-
-class TestE2E:
-
-    def test_pipeline_with_two_config(self):
-        def do_parse(
-            output_dir,  # Output directory for storing parsing results
-            pdf_file_names: list[str],  # List of PDF file names to be parsed
-            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
-            p_lang_list: list[
-                str
-            ],  # List of languages for each PDF, default is 'ch' (Chinese)
-            parse_method="auto",  # The method for parsing PDF, default is 'auto'
-            formula_enable=True,  # Enable formula parsing
-            table_enable=True,  # Enable table parsing
-            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
-            f_draw_span_bbox=True,  # Whether to draw span bounding boxes
-            f_dump_md=True,  # Whether to dump markdown files
-            f_dump_middle_json=True,  # Whether to dump middle JSON files
-            f_dump_model_output=True,  # Whether to dump model output files
-            f_dump_orig_pdf=True,  # Whether to dump original PDF files
-            f_dump_content_list=True,  # Whether to dump content list files
-            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
-            start_page_id=0,  # Start page ID for parsing, default is 0
-            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
-        ):
-            for idx, pdf_bytes in enumerate(pdf_bytes_list):
-                new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
-                    pdf_bytes, start_page_id, end_page_id
-                )
-                pdf_bytes_list[idx] = new_pdf_bytes
-
-            (
-                infer_results,
-                all_image_lists,
-                all_pdf_docs,
-                lang_list,
-                ocr_enabled_list,
-            ) = pipeline_doc_analyze(
-                pdf_bytes_list,
-                p_lang_list,
-                parse_method=parse_method,
-                formula_enable=formula_enable,
-                table_enable=table_enable,
-            )
-
-            for idx, model_list in enumerate(infer_results):
-                model_json = copy.deepcopy(model_list)
-                pdf_file_name = pdf_file_names[idx]
-                local_image_dir, local_md_dir = prepare_env(
-                    output_dir, pdf_file_name, parse_method
-                )
-                image_writer, md_writer = FileBasedDataWriter(
-                    local_image_dir
-                ), FileBasedDataWriter(local_md_dir)
-
-                images_list = all_image_lists[idx]
-                pdf_doc = all_pdf_docs[idx]
-                _lang = lang_list[idx]
-                _ocr_enable = ocr_enabled_list[idx]
-                middle_json = pipeline_result_to_middle_json(
-                    model_list,
-                    images_list,
-                    pdf_doc,
-                    image_writer,
-                    _lang,
-                    _ocr_enable,
-                    formula_enable,
-                )
-
-                pdf_info = middle_json["pdf_info"]
-
-                pdf_bytes = pdf_bytes_list[idx]
-                if f_draw_layout_bbox:
-                    draw_layout_bbox(
-                        pdf_info,
-                        pdf_bytes,
-                        local_md_dir,
-                        f"{pdf_file_name}_layout.pdf",
-                    )
-
-                if f_draw_span_bbox:
-                    draw_span_bbox(
-                        pdf_info,
-                        pdf_bytes,
-                        local_md_dir,
-                        f"{pdf_file_name}_span.pdf",
-                    )
 
-                if f_dump_orig_pdf:
-                    md_writer.write(
-                        f"{pdf_file_name}_origin.pdf",
-                        pdf_bytes,
-                    )
-
-                if f_dump_md:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    md_content_str = pipeline_union_make(
-                        pdf_info, f_make_md_mode, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}.md",
-                        md_content_str,
-                    )
-
-                if f_dump_content_list:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    content_list = pipeline_union_make(
-                        pdf_info, MakeMode.CONTENT_LIST, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}_content_list.json",
-                        json.dumps(content_list, ensure_ascii=False, indent=4),
-                    )
 
-                if f_dump_middle_json:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_middle.json",
-                        json.dumps(middle_json, ensure_ascii=False, indent=4),
-                    )
-
-                if f_dump_model_output:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_model.json",
-                        json.dumps(model_json, ensure_ascii=False, indent=4),
-                    )
-
-                logger.info(f"local output dir is {local_md_dir}")
-
-        def parse_doc(
-            path_list: list[Path],
-            output_dir,
-            lang="ch",
-            method="auto",
-            start_page_id=0,
-            end_page_id=None,
-        ):
-            file_name_list = []
-            pdf_bytes_list = []
-            lang_list = []
-            for path in path_list:
-                file_name = str(Path(path).stem)
-                pdf_bytes = read_fn(path)
-                file_name_list.append(file_name)
-                pdf_bytes_list.append(pdf_bytes)
-                lang_list.append(lang)
-            # Run do_parse twice: once with formula and table parsing enabled, once without
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                p_lang_list=lang_list,
-                parse_method=method,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                p_lang_list=lang_list,
-                parse_method=method,
-                table_enable=False,
-                formula_enable=False,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-
-        __dir__ = os.path.dirname(os.path.abspath(__file__))
-        pdf_files_dir = os.path.join(__dir__, "pdfs")
-        output_dir = os.path.join(__dir__, "output")
-        pdf_suffixes = [".pdf"]
-        image_suffixes = [".png", ".jpeg", ".jpg"]
-
-        doc_path_list = []
-        for doc_path in Path(pdf_files_dir).glob("*"):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
-                doc_path_list.append(doc_path)
-
-        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
-        parse_doc(doc_path_list, output_dir)
-
-    def test_vlm_transformers_with_default_config(self):
-        def do_parse(
-            output_dir,  # Output directory for storing parsing results
-            pdf_file_names: list[str],  # List of PDF file names to be parsed
-            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
-            server_url=None,  # Server URL for vlm-sglang-client backend
-            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
-            f_dump_md=True,  # Whether to dump markdown files
-            f_dump_middle_json=True,  # Whether to dump middle JSON files
-            f_dump_model_output=True,  # Whether to dump model output files
-            f_dump_orig_pdf=True,  # Whether to dump original PDF files
-            f_dump_content_list=True,  # Whether to dump content list files
-            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
-            start_page_id=0,  # Start page ID for parsing, default is 0
-            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
-        ):
-            backend = "transformers"
-            f_draw_span_bbox = False
-            parse_method = "vlm"
-            for idx, pdf_bytes in enumerate(pdf_bytes_list):
-                pdf_file_name = pdf_file_names[idx]
-                pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
-                    pdf_bytes, start_page_id, end_page_id
-                )
-                local_image_dir, local_md_dir = prepare_env(
-                    output_dir, pdf_file_name, parse_method
+def test_pipeline_with_two_config():
+    __dir__ = os.path.dirname(os.path.abspath(__file__))
+    pdf_files_dir = os.path.join(__dir__, "pdfs")
+    output_dir = os.path.join(__dir__, "output")
+    pdf_suffixes = [".pdf"]
+    image_suffixes = [".png", ".jpeg", ".jpg"]
+
+    doc_path_list = []
+    for doc_path in Path(pdf_files_dir).glob("*"):
+        if doc_path.suffix in pdf_suffixes + image_suffixes:
+            doc_path_list.append(doc_path)
+
+    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
+
+    pdf_file_names = []
+    pdf_bytes_list = []
+    p_lang_list = []
+    for path in doc_path_list:
+        file_name = str(Path(path).stem)
+        pdf_bytes = read_fn(path)
+        pdf_file_names.append(file_name)
+        pdf_bytes_list.append(pdf_bytes)
+        p_lang_list.append("en")
+    for idx, pdf_bytes in enumerate(pdf_bytes_list):
+        new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
+        pdf_bytes_list[idx] = new_pdf_bytes
+
+    # Get the pipeline analysis results and test both the "txt" and "ocr" parse methods
+    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
+        pipeline_doc_analyze(
+            pdf_bytes_list,
+            p_lang_list,
+            parse_method="txt",
+        )
+    )
+    write_infer_result(
+        infer_results,
+        all_image_lists,
+        all_pdf_docs,
+        lang_list,
+        ocr_enabled_list,
+        pdf_file_names,
+        output_dir,
+        parse_method="txt",
+    )
+    assert_content("tests/unittest/output/test/txt/test_content_list.json")
+    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
+        pipeline_doc_analyze(
+            pdf_bytes_list,
+            p_lang_list,
+            parse_method="ocr",
+        )
+    )
+    write_infer_result(
+        infer_results,
+        all_image_lists,
+        all_pdf_docs,
+        lang_list,
+        ocr_enabled_list,
+        pdf_file_names,
+        output_dir,
+        parse_method="ocr",
+    )
+    assert_content("tests/unittest/output/test/ocr/test_content_list.json")
+
+
+def test_vlm_transformers_with_default_config():
+    __dir__ = os.path.dirname(os.path.abspath(__file__))
+    pdf_files_dir = os.path.join(__dir__, "pdfs")
+    output_dir = os.path.join(__dir__, "output")
+    pdf_suffixes = [".pdf"]
+    image_suffixes = [".png", ".jpeg", ".jpg"]
+
+    doc_path_list = []
+    for doc_path in Path(pdf_files_dir).glob("*"):
+        if doc_path.suffix in pdf_suffixes + image_suffixes:
+            doc_path_list.append(doc_path)
+
+    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
+
+    pdf_file_names = []
+    pdf_bytes_list = []
+    p_lang_list = []
+    for path in doc_path_list:
+        file_name = str(Path(path).stem)
+        pdf_bytes = read_fn(path)
+        pdf_file_names.append(file_name)
+        pdf_bytes_list.append(pdf_bytes)
+        p_lang_list.append("en")
+
+    for idx, pdf_bytes in enumerate(pdf_bytes_list):
+        pdf_file_name = pdf_file_names[idx]
+        pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
+        local_image_dir, local_md_dir = prepare_env(
+            output_dir, pdf_file_name, parse_method="vlm"
+        )
+        image_writer, md_writer = FileBasedDataWriter(
+            local_image_dir
+        ), FileBasedDataWriter(local_md_dir)
+        middle_json, infer_result = vlm_doc_analyze(
+            pdf_bytes, image_writer=image_writer, backend="transformers"
+        )
+
+        pdf_info = middle_json["pdf_info"]
+
+        image_dir = str(os.path.basename(local_image_dir))
+
+        md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+
+        content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+
+        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
+        md_writer.write_string(
+            f"{pdf_file_name}_model_output.txt",
+            model_output,
+        )
+
+        logger.info(f"local output dir is {local_md_dir}")
+        assert_content("tests/unittest/output/test/vlm/test_content_list.json")
+
+
+def write_infer_result(
+    infer_results,
+    all_image_lists,
+    all_pdf_docs,
+    lang_list,
+    ocr_enabled_list,
+    pdf_file_names,
+    output_dir,
+    parse_method,
+):
+    for idx, model_list in enumerate(infer_results):
+        model_json = copy.deepcopy(model_list)
+        pdf_file_name = pdf_file_names[idx]
+        local_image_dir, local_md_dir = prepare_env(
+            output_dir, pdf_file_name, parse_method
+        )
+        image_writer, md_writer = FileBasedDataWriter(
+            local_image_dir
+        ), FileBasedDataWriter(local_md_dir)
+
+        images_list = all_image_lists[idx]
+        pdf_doc = all_pdf_docs[idx]
+        _lang = lang_list[idx]
+        _ocr_enable = ocr_enabled_list[idx]
+        middle_json = pipeline_result_to_middle_json(
+            model_list,
+            images_list,
+            pdf_doc,
+            image_writer,
+            _lang,
+            _ocr_enable,
+            True,
+        )
+
+        pdf_info = middle_json["pdf_info"]
+
+        image_dir = str(os.path.basename(local_image_dir))
+        # Write the markdown file
+        md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+
+        content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+
+        md_writer.write_string(
+            f"{pdf_file_name}_model.json",
+            json.dumps(model_json, ensure_ascii=False, indent=4),
+        )
+
+        logger.info(f"local output dir is {local_md_dir}")
+
+
+def validate_html(html_content):
+    # html.parser is lenient and rarely raises, so this mainly guards
+    # against catastrophic parse failures rather than strict validity.
+    try:
+        BeautifulSoup(html_content, "html.parser")
+        return True
+    except Exception:
+        return False
+
+
+def assert_content(content_path):
+    content_list = []
+    with open(content_path, "r", encoding="utf-8") as file:
+        content_list = json.load(file)
+    type_set = set()
+    for content_dict in content_list:
+        match content_dict["type"]:
+            # Image check: validate only the caption
+            case "image":
+                type_set.add("image")
+                assert (
+                    content_dict["image_caption"][0].strip().lower()
+                    == "Figure 1: Figure Caption".lower()
                 )
-                image_writer, md_writer = FileBasedDataWriter(
-                    local_image_dir
-                ), FileBasedDataWriter(local_md_dir)
-                middle_json, infer_result = vlm_doc_analyze(
-                    pdf_bytes,
-                    image_writer=image_writer,
-                    backend=backend,
-                    server_url=server_url,
+            # Table check: validate the caption, table format, and table contents
+            case "table":
+                type_set.add("table")
+                assert (
+                    content_dict["table_caption"][0].strip().lower()
+                    == "Table 1: Table Caption".lower()
                 )
-
-                pdf_info = middle_json["pdf_info"]
-
-                if f_draw_layout_bbox:
-                    draw_layout_bbox(
-                        pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf"
-                    )
-
-                if f_draw_span_bbox:
-                    draw_span_bbox(
-                        pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf"
-                    )
-
-                if f_dump_orig_pdf:
-                    md_writer.write(
-                        f"{pdf_file_name}_origin.pdf",
-                        pdf_bytes,
-                    )
-
-                if f_dump_md:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
-                    md_writer.write_string(
-                        f"{pdf_file_name}.md",
-                        md_content_str,
+                assert validate_html(content_dict["table_body"])
+                target_str_list = [
+                    "Linear Regression",
+                    "0.98740",
+                    "1321.2",
+                    "2-order Polynomial",
+                    "0.99906",
+                    "26.4",
+                    "3-order Polynomial",
+                    "0.99913",
+                    "101.2",
+                    "4-order Polynomial",
+                    "0.99914",
+                    "94.1",
+                    "Gray Prediction",
+                    "0.00617",
+                    "687",
+                ]
+                correct_count = 0
+                for target_str in target_str_list:
+                    if target_str in content_dict["table_body"]:
+                        correct_count += 1
+
+                assert correct_count > 0.9 * len(target_str_list)
+            # Equation check: verify formula elements are present
+            case "equation":
+                type_set.add("equation")
+                target_str_list = ["$$", "lambda", "frac", "bar"]
+                for target_str in target_str_list:
+                    assert target_str in content_dict["text"]
+            # Text check: text similarity must exceed 90
+            case "text":
+                type_set.add("text")
+                assert (
+                    fuzz.ratio(
+                        content_dict["text"],
+                        "Trump graduated from the Wharton School of the University of Pennsylvania with a bachelor's degree in 1968. He became president of his father's real estate business in 1971 and renamed it The Trump Organization.",
                     )
-
-                if f_dump_content_list:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    content_list = vlm_union_make(
-                        pdf_info, MakeMode.CONTENT_LIST, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}_content_list.json",
-                        json.dumps(content_list, ensure_ascii=False, indent=4),
-                    )
-
-                if f_dump_middle_json:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_middle.json",
-                        json.dumps(middle_json, ensure_ascii=False, indent=4),
-                    )
-
-                if f_dump_model_output:
-                    model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
-                    md_writer.write_string(
-                        f"{pdf_file_name}_model_output.txt",
-                        model_output,
-                    )
-
-                logger.info(f"local output dir is {local_md_dir}")
-
-        def parse_doc(
-            path_list: list[Path],
-            output_dir,
-            lang="ch",
-            server_url=None,
-            start_page_id=0,
-            end_page_id=None,
-        ):
-            file_name_list = []
-            pdf_bytes_list = []
-            lang_list = []
-            for path in path_list:
-                file_name = str(Path(path).stem)
-                pdf_bytes = read_fn(path)
-                file_name_list.append(file_name)
-                pdf_bytes_list.append(pdf_bytes)
-                lang_list.append(lang)
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                server_url=server_url,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-
-        __dir__ = os.path.dirname(os.path.abspath(__file__))
-        pdf_files_dir = os.path.join(__dir__, "pdfs")
-        output_dir = os.path.join(__dir__, "output")
-        pdf_suffixes = [".pdf"]
-        image_suffixes = [".png", ".jpeg", ".jpg"]
-
-        doc_path_list = []
-        for doc_path in Path(pdf_files_dir).glob("*"):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
-                doc_path_list.append(doc_path)
-
-        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
-        parse_doc(doc_path_list, output_dir)
+                    > 90
+                )
+    assert len(type_set) >= 4
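
Since assert_content opens fixture paths like tests/unittest/output/test/txt/test_content_list.json relative to the working directory, these tests presumably need to run from the repository root (e.g. pytest tests/unittest/test_e2e.py). The table branch is also deliberately tolerant: with 15 expected cell strings, the 90% threshold allows exactly one miss. A self-contained sketch of that rule, using the first three fixture strings:

    # correct_count > 0.9 * len(expected) lets a long expectation list
    # absorb one OCR miss (14 > 13.5) but fails on two (13 < 13.5).
    expected = ["Linear Regression", "0.98740", "1321.2"]
    table_body = "<td>Linear Regression</td><td>0.98740</td><td>1321.2</td>"

    hits = sum(cell in table_body for cell in expected)
    assert hits > 0.9 * len(expected)  # with only 3 strings, all must match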