瀏覽代碼

fix: add magic-pdf-dev case

quyuan 1 年之前
父節點
當前提交
8df8737ef4

+ 4 - 0
LICENSE.md

@@ -659,3 +659,7 @@ specific requirements.
 if any, to sign a "copyright disclaimer" for the program, if necessary.
 if any, to sign a "copyright disclaimer" for the program, if necessary.
 For more information on this, and how to apply and follow the GNU AGPL, see
 For more information on this, and how to apply and follow the GNU AGPL, see
 <https://www.gnu.org/licenses/>.
 <https://www.gnu.org/licenses/>.
+
+
+
+$^1$

+ 26 - 0
mv_pdf.py

@@ -0,0 +1,26 @@
+import os
+import shutil
+
+def move_pdfs(root_folder, destination_folder):
+    # Walk every file in the root folder and all of its subdirectories
+    for root, dirs, files in os.walk(root_folder):
+        for file in files:
+            if file.endswith('.pdf'):  # case-sensitive suffix match: a name ending in '.PDF' is not selected
+                # Build the full source path
+                src_path = os.path.join(root, file)
+                # Build the destination path (only the file name is kept, so the result is a flat folder)
+                dst_path = os.path.join(destination_folder, file)
+                
+                # Move the file
+                shutil.move(src_path, dst_path)
+                print(f'Moved {file} to {destination_folder}')
+
+# Usage
+root_folder = r'D:\mineru\datasets\datasets'  # source folder path
+destination_folder = r'D:\mineru\datasets\pdf'  # destination folder path
+
+# Create the destination folder if it does not exist
+if not os.path.exists(destination_folder):
+    os.makedirs(destination_folder)
+
+move_pdfs(root_folder, destination_folder)

二進制
tests.zip


+ 2 - 1
tests/retry_env.sh

@@ -8,7 +8,8 @@ while true; do
     # prepare env
     # prepare env
     source activate MinerU
     source activate MinerU
     pip install -r requirements-qa.txt
     pip install -r requirements-qa.txt
-    pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
+    pip uninstall magic-pdf
+    pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
     pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
     pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
     exit_code=$?
     exit_code=$?
     if [ $exit_code -eq 0 ]; then
     if [ $exit_code -eq 0 ]; then

+ 3 - 3
tests/test_cli/conf/conf.py

@@ -2,6 +2,6 @@ import os
 conf = {
 conf = {
 "code_path": os.environ.get('GITHUB_WORKSPACE'),
 "code_path": os.environ.get('GITHUB_WORKSPACE'),
 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
-"pdf_res_path": "/tmp/magic-pdf"
-}
-
+"pdf_res_path": "/tmp/magic-pdf",
+"jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl"
+}

文件差異過大導致無法顯示
+ 0 - 0
tests/test_cli/pdf_dev/line1.jsonl


+ 0 - 54
tests/test_cli/test_bench.py

@@ -1,54 +0,0 @@
-"""
-bench
-"""
-import os
-import shutil
-import json
-from lib import calculate_score
-import pytest
-from conf import conf
-
-code_path = os.environ.get('GITHUB_WORKSPACE')
-pdf_dev_path = conf.conf["pdf_dev_path"]
-pdf_res_path = conf.conf["pdf_res_path"]
-
-class TestBench():
-    """
-    test bench
-    """
-    def test_ci_ben(self):
-        """
-        ci benchmark
-        """
-        fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
-        lines = fr.readlines()
-        last_line = lines[-1].strip()
-        last_score = json.loads(last_line)
-        last_simscore = last_score["average_sim_score"]
-        last_editdistance = last_score["average_edit_distance"]
-        last_bleu = last_score["average_bleu_score"]
-        os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
-        now_score = get_score()
-        print ("now_score:", now_score)
-        if not os.path.exists(os.path.join(pdf_dev_path, "ci")):
-            os.makedirs(os.path.join(pdf_dev_path, "ci"), exist_ok=True)
-        fw = open(os.path.join(pdf_dev_path, "ci", "result.json"), "w+", encoding="utf-8")
-        fw.write(json.dumps(now_score) + "\n")
-        now_simscore = now_score["average_sim_score"]
-        now_editdistance = now_score["average_edit_distance"]
-        now_bleu = now_score["average_bleu_score"]
-        assert last_simscore <= now_simscore
-        assert last_editdistance <= now_editdistance
-        assert last_bleu <= now_bleu
-
-
-def get_score():
-    """
-    get score
-    """
-    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
-    score.calculate_similarity_total("mineru", pdf_dev_path)
-    res = score.summary_scores()
-    return res
-
-

+ 50 - 0
tests/test_cli/test_cli_sdk.py

@@ -178,6 +178,56 @@ class TestCli:
             common.cli_count_folders_and_check_contents(
             common.cli_count_folders_and_check_contents(
                 os.path.join(res_path, demo_name, 'ocr'))
                 os.path.join(res_path, demo_name, 'ocr'))
 
 
+    @pytest.mark.P1
+    def test_pdf_dev_cli_local_jsonl_txt(self):
+        """magic_pdf_dev cli local txt."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
+
+    @pytest.mark.P1
+    def test_pdf_dev_cli_local_jsonl_ocr(self):
+        """magic_pdf_dev cli local ocr."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
+    @pytest.mark.P1
+    def test_pdf_dev_cli_local_jsonl_auto(self):
+        """magic_pdf_dev cli local auto."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'auto')
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
+    @pytest.mark.P1
+    def test_pdf_dev_cli_s3_jsonl_txt(self):
+        """magic_pdf_dev cli s3 txt."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')  # NOTE(review): path is built under pdf_dev_path although the test is named "s3"; possibly the "jsonl_path" entry of tests/test_cli/conf/conf.py was intended - confirm
+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
+
+    @pytest.mark.P1
+    def test_pdf_dev_cli_s3_jsonl_ocr(self):
+        """magic_pdf_dev cli s3 ocr."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')  # NOTE(review): path is built under pdf_dev_path although the test is named "s3"; possibly the "jsonl_path" entry of tests/test_cli/conf/conf.py was intended - confirm
+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
+    @pytest.mark.P1
+    def test_pdf_dev_cli_s3_jsonl_auto(self):
+        """magic_pdf_dev cli s3 auto."""
+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')  # NOTE(review): path is built under pdf_dev_path although the test is named "s3"; possibly the "jsonl_path" entry of tests/test_cli/conf/conf.py was intended - confirm
+        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')  # NOTE(review): uses --method where the sibling tests use -m; presumably equivalent - confirm
+        logging.info(cmd)
+        os.system(cmd)  # NOTE(review): the value returned by os.system is discarded and nothing is asserted
+
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     pytest.main()
     pytest.main()

+ 0 - 0
tests/test_cli/test_magic-pdf-dev_cli.py


+ 36 - 0
tests/test_cli/test_performence.py

@@ -0,0 +1,36 @@
+"""
+test performance
+"""
+import os
+import shutil
+import json
+from lib import calculate_score
+import pytest
+from conf import conf
+
+code_path = os.environ.get('GITHUB_WORKSPACE')
+pdf_dev_path = conf.conf["pdf_dev_path"]
+pdf_res_path = conf.conf["pdf_res_path"]
+
+class TestTable():  # NOTE(review): class is named TestTable although this module's docstring says "test performance" - confirm the intended name
+    """
+    test table
+    """
+    def test_perf_close_table(self):  # placeholder: the body is only a docstring, there are no statements or assertions yet
+        """
+        test perf when close table
+        """
+
+
+
+
+def get_score():  # not called anywhere in the visible part of this file
+    """
+    get score
+    """
+    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))  # Scoring object built from <pdf_dev_path>/result.json
+    score.calculate_similarity_total("mineru", pdf_dev_path)
+    res = score.summary_scores()  # whatever summary_scores() returns is handed back unchanged
+    return res
+
+

+ 54 - 0
tests/test_cli/test_table.py

@@ -0,0 +1,54 @@
+"""
+test table case
+"""
+import os
+import shutil
+import json
+from lib import calculate_score
+import pytest
+from conf import conf
+
+code_path = os.environ.get('GITHUB_WORKSPACE')
+pdf_dev_path = conf.conf["pdf_dev_path"]
+pdf_res_path = conf.conf["pdf_res_path"]
+
+class TestTable():  # every test below is a placeholder: each body is only a docstring
+    """
+    test table
+    """
+    def test_paddle_table_master_cuda(self):  # placeholder, no assertions yet
+        """
+        select table: paddle table master,mode is cuda
+        """
+    def test_paddle_table_master_cpu(self):  # placeholder, no assertions yet
+        """
+        select table: paddle table master, mode is cpu
+        """
+    def test_st_table_cuda(self):  # placeholder, no assertions yet
+        """
+        select table: ST, mode is cuda 
+        """
+
+    def test_st_table_cpu(self):  # placeholder, no assertions yet
+        """
+        select table: ST, mode is cpu
+        """
+
+    def test_close_table_cuda(self):  # placeholder, no assertions yet
+        """
+        close table, mode is cuda
+        """
+    
+
+
+
+def get_score():  # not called anywhere in the visible part of this file
+    """
+    get score
+    """
+    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))  # Scoring object built from <pdf_dev_path>/result.json
+    score.calculate_similarity_total("mineru", pdf_dev_path)
+    res = score.summary_scores()  # whatever summary_scores() returns is handed back unchanged
+    return res
+
+

+ 0 - 0
tests/test_table/assets/table.jpg → tests/unittest/test_table/assets/table.jpg


+ 2 - 2
tests/test_table/test_tablemaster.py → tests/unittest/test_table/test_tablemaster.py

@@ -4,10 +4,10 @@ from magic_pdf.model.ppTableModel import ppTableModel
 
 
 class TestppTableModel(unittest.TestCase):
 class TestppTableModel(unittest.TestCase):
     def test_image2html(self):
     def test_image2html(self):
-        img = Image.open("tests/test_table/assets/table.jpg")
+        img = Image.open("tests/unittest/test_table/assets/table.jpg")
         # 修改table模型路径
         # 修改table模型路径
         config = {"device": "cuda",
         config = {"device": "cuda",
-                  "model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
+                  "model_dir": "/home/quyuan/PDF-Extract-Kit/models/TabRec/TableMaster"}
         table_model = ppTableModel(config)
         table_model = ppTableModel(config)
         res = table_model.img2html(img)
         res = table_model.img2html(img)
         true_value = """<td><table  border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""
         true_value = """<td><table  border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""

部分文件因文件數量過多而無法顯示