1 年之前 · 8df8737ef4
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -659,3 +659,7 @@ specific requirements.
 
															 if any, to sign a "copyright disclaimer" for the program, if necessary.
														
 
															 For more information on this, and how to apply and follow the GNU AGPL, see
														
 
															 <https://www.gnu.org/licenses/>.
														
 
															+
														
 
															+
														
 
															+
														
 
															+$^1$
														
--- a/mv_pdf.py
+++ b/mv_pdf.py
@@ -0,0 +1,26 @@
 
															+import os
														
 
															+import shutil
														
 
															+
														
 
															+def move_pdfs(root_folder, destination_folder):
														
 
															+    # 遍历根目录及其子目录中的所有文件
														
 
															+    for root, dirs, files in os.walk(root_folder):
														
 
															+        for file in files:
														
 
															+            if file.endswith('.pdf'):
														
 
															+                # 构建完整的文件路径
														
 
															+                src_path = os.path.join(root, file)
														
 
															+                # 构建目标路径
														
 
															+                dst_path = os.path.join(destination_folder, file)
														
 
															+                
														
 
															+                # 移动文件
														
 
															+                shutil.move(src_path, dst_path)
														
 
															+                print(f'Moved {file} to {destination_folder}')
														
 
															+
														
 
															+# 使用方法
														
 
															+root_folder = r'D:\mineru\datasets\datasets'  # 源文件夹路径
														
 
															+destination_folder = r'D:\mineru\datasets\pdf'  # 目标文件夹路径
														
 
															+
														
 
															+# 创建目标文件夹如果不存在
														
 
															+if not os.path.exists(destination_folder):
														
 
															+    os.makedirs(destination_folder)
														
 
															+
														
 
															+move_pdfs(root_folder, destination_folder)
														
--- a/tests.zip
+++ b/tests.zip
--- a/tests/retry_env.sh
+++ b/tests/retry_env.sh
@@ -8,7 +8,8 @@ while true; do
 
															     # prepare env
														
 
															     source activate MinerU
														
 
															     pip install -r requirements-qa.txt
														
 
															-    pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
														
 
															+    pip uninstall magic-pdf
														
 
															+    pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
														
 
															     pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
														
 
															     exit_code=$?
														
 
															     if [ $exit_code -eq 0 ]; then
														
--- a/tests/test_cli/conf/conf.py
+++ b/tests/test_cli/conf/conf.py
@@ -2,6 +2,6 @@ import os
 
															 conf = {
														
 
															 "code_path": os.environ.get('GITHUB_WORKSPACE'),
														
 
															 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
														
 
															-"pdf_res_path": "/tmp/magic-pdf"
														
 
															-}
														
 
															-
														
 
															+"pdf_res_path": "/tmp/magic-pdf",
														
 
															+"jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl"
														
 
															+}
														
--- a/tests/test_cli/pdf_dev/line1.jsonl
+++ b/tests/test_cli/pdf_dev/line1.jsonl
--- a/tests/test_cli/test_bench.py
+++ b/tests/test_cli/test_bench.py
@@ -1,54 +0,0 @@
 
															-"""
														
 
															-bench
														
 
															-"""
														
 
															-import os
														
 
															-import shutil
														
 
															-import json
														
 
															-from lib import calculate_score
														
 
															-import pytest
														
 
															-from conf import conf
														
 
															-
														
 
															-code_path = os.environ.get('GITHUB_WORKSPACE')
														
 
															-pdf_dev_path = conf.conf["pdf_dev_path"]
														
 
															-pdf_res_path = conf.conf["pdf_res_path"]
														
 
															-
														
 
															-class TestBench():
														
 
															-    """
														
 
															-    test bench
														
 
															-    """
														
 
															-    def test_ci_ben(self):
														
 
															-        """
														
 
															-        ci benchmark
														
 
															-        """
														
 
															-        fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
														
 
															-        lines = fr.readlines()
														
 
															-        last_line = lines[-1].strip()
														
 
															-        last_score = json.loads(last_line)
														
 
															-        last_simscore = last_score["average_sim_score"]
														
 
															-        last_editdistance = last_score["average_edit_distance"]
														
 
															-        last_bleu = last_score["average_bleu_score"]
														
 
															-        os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
														
 
															-        now_score = get_score()
														
 
															-        print ("now_score:", now_score)
														
 
															-        if not os.path.exists(os.path.join(pdf_dev_path, "ci")):
														
 
															-            os.makedirs(os.path.join(pdf_dev_path, "ci"), exist_ok=True)
														
 
															-        fw = open(os.path.join(pdf_dev_path, "ci", "result.json"), "w+", encoding="utf-8")
														
 
															-        fw.write(json.dumps(now_score) + "\n")
														
 
															-        now_simscore = now_score["average_sim_score"]
														
 
															-        now_editdistance = now_score["average_edit_distance"]
														
 
															-        now_bleu = now_score["average_bleu_score"]
														
 
															-        assert last_simscore <= now_simscore
														
 
															-        assert last_editdistance <= now_editdistance
														
 
															-        assert last_bleu <= now_bleu
														
 
															-
														
 
															-
														
 
															-def get_score():
														
 
															-    """
														
 
															-    get score
														
 
															-    """
														
 
															-    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
														
 
															-    score.calculate_similarity_total("mineru", pdf_dev_path)
														
 
															-    res = score.summary_scores()
														
 
															-    return res
														
 
															-
														
 
															-
														
--- a/tests/test_cli/test_cli_sdk.py
+++ b/tests/test_cli/test_cli_sdk.py
@@ -178,6 +178,56 @@ class TestCli:
 
															             common.cli_count_folders_and_check_contents(
														
 
															                 os.path.join(res_path, demo_name, 'ocr'))
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_local_jsonl_txt(self):
														
 
															+        """magic_pdf_dev cli local txt."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															+
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_local_jsonl_ocr(self):
														
 
															+        """magic_pdf_dev cli local ocr."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_local_jsonl_auto(self):
														
 
															+        """magic_pdf_dev cli local auto."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'auto')
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_s3_jsonl_txt(self):
														
 
															+        """magic_pdf_dev cli s3 txt."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, "txt")
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															+
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_s3_jsonl_ocr(self):
														
 
															+        """magic_pdf_dev cli s3 ocr."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s -m %s' % (jsonl_path, 'ocr')
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															+    @pytest.mark.P1
														
 
															+    def test_pdf_dev_cli_s3_jsonl_auto(self):
														
 
															+        """magic_pdf_dev cli s3 auto."""
														
 
															+        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
														
 
															+        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
														
 
															+        logging.info(cmd)
														
 
															+        os.system(cmd)
														
 
															+
														
 
															 if __name__ == '__main__':
														
 
															     pytest.main()
														
--- a/tests/test_cli/test_magic-pdf-dev_cli.py
+++ b/tests/test_cli/test_magic-pdf-dev_cli.py
--- a/tests/test_cli/test_performence.py
+++ b/tests/test_cli/test_performence.py
@@ -0,0 +1,36 @@
 
															+"""
														
 
															+test performance
														
 
															+"""
														
 
															+import os
														
 
															+import shutil
														
 
															+import json
														
 
															+from lib import calculate_score
														
 
															+import pytest
														
 
															+from conf import conf
														
 
															+
														
 
															+code_path = os.environ.get('GITHUB_WORKSPACE')
														
 
															+pdf_dev_path = conf.conf["pdf_dev_path"]
														
 
															+pdf_res_path = conf.conf["pdf_res_path"]
														
 
															+
														
 
															+class TestTable():
														
 
															+    """
														
 
															+    test table
														
 
															+    """
														
 
															+    def test_perf_close_table(self):
														
 
															+        """
														
 
															+        test perf when close table
														
 
															+        """
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+def get_score():
														
 
															+    """
														
 
															+    get score
														
 
															+    """
														
 
															+    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
														
 
															+    score.calculate_similarity_total("mineru", pdf_dev_path)
														
 
															+    res = score.summary_scores()
														
 
															+    return res
														
 
															+
														
 
															+
														
--- a/tests/test_cli/test_table.py
+++ b/tests/test_cli/test_table.py
@@ -0,0 +1,54 @@
 
															+"""
														
 
															+test table case
														
 
															+"""
														
 
															+import os
														
 
															+import shutil
														
 
															+import json
														
 
															+from lib import calculate_score
														
 
															+import pytest
														
 
															+from conf import conf
														
 
															+
														
 
															+code_path = os.environ.get('GITHUB_WORKSPACE')
														
 
															+pdf_dev_path = conf.conf["pdf_dev_path"]
														
 
															+pdf_res_path = conf.conf["pdf_res_path"]
														
 
															+
														
 
															+class TestTable():
														
 
															+    """
														
 
															+    test table
														
 
															+    """
														
 
															+    def test_paddle_table_master_cuda(self):
														
 
															+        """
														
 
															+        select table: paddle table master,mode is cuda
														
 
															+        """
														
 
															+    def test_paddle_table_master_cpu(self):
														
 
															+        """
														
 
															+        select table: paddle table master, mode is cpu
														
 
															+        """
														
 
															+    def test_st_table_cuda(self):
														
 
															+        """
														
 
															+        select table: ST, mode is cuda 
														
 
															+        """
														
 
															+
														
 
															+    def test_st_table_cpu(self):
														
 
															+        """
														
 
															+        select table: ST, mode is cpu
														
 
															+        """
														
 
															+
														
 
															+    def test_close_table_cuda(self):
														
 
															+        """
														
 
															+        close table, mode is cuda
														
 
															+        """
														
 
															+    
														
 
															+
														
 
															+
														
 
															+
														
 
															+def get_score():
														
 
															+    """
														
 
															+    get score
														
 
															+    """
														
 
															+    score = calculate_score.Scoring(os.path.join(pdf_dev_path, "result.json"))
														
 
															+    score.calculate_similarity_total("mineru", pdf_dev_path)
														
 
															+    res = score.summary_scores()
														
 
															+    return res
														
 
															+
														
 
															+
														
--- a/tests/unittest/test_table/assets/table.jpg
+++ b/tests/unittest/test_table/assets/table.jpg
--- a/tests/unittest/test_table/test_tablemaster.py
+++ b/tests/unittest/test_table/test_tablemaster.py
@@ -4,10 +4,10 @@ from magic_pdf.model.ppTableModel import ppTableModel
 
															 class TestppTableModel(unittest.TestCase):
														
 
															     def test_image2html(self):
														
 
															-        img = Image.open("tests/test_table/assets/table.jpg")
														
 
															+        img = Image.open("tests/unittest/test_table/assets/table.jpg")
														
 
															         # 修改table模型路径
														
 
															         config = {"device": "cuda",
														
 
															-                  "model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
														
 
															+                  "model_dir": "/home/quyuan/PDF-Extract-Kit/models/TabRec/TableMaster"}
														
 
															         table_model = ppTableModel(config)
														
 
															         res = table_model.img2html(img)
														
 
															         true_value = """<td><table  border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2 </td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88.</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></td>\n"""