浏览代码

update ci timeout

quyuan 1 年之前
父节点
当前提交
b3c6d67684
共有 2 个文件被更改,包括 25 次插入和 9 次删除
  1. +15 -1
      requirements.txt
  2. +10 -8
      tools/benchmark.py

+ 15 - 1
requirements.txt

@@ -14,4 +14,18 @@ wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-pdfminer.six>=20231228
+pdfminer.six>=20231228
+Levenshtein
+nltk
+rapidfuzz
+statistics
+openxlab #安装opendatalab
+pandas
+numpy
+matplotlib
+seaborn
+scipy
+scikit-learn
+tqdm
+htmltabletomd
+pypandoc

+ 10 - 8
tools/benchmark.py

@@ -5,18 +5,20 @@ code_path = os.environ.get('GITHUB_WORKSPACE')
 pdf_dev_path = "/home/quyuan/data"
 pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"
 def test_cli():
-    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, pdf_dev_path)
+    magicpdf_path = os.path.join(pdf_dev_path, "output")
+    if not os.path.exists(magicpdf_path):
+        os.makedirs(magicpdf_path)
+    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, magicpdf_path)
     os.system(cmd)
-    if not os.path.exists(os.path.join(pdf_dev_path, "output")):
-        os.makedirs(os.path.join(pdf_dev_path, "output"))
-    for annotaion_name in os.listdir(os.path.join(pdf_dev_path, "output")):
-        if annotaion_name.endswith('.pdf'):
+   
+    for annotaion_name in os.walk(os.path.join(pdf_dev_path, "ci")):
+        if annotaion_name.endswith('.md'):
             for pdf_res_path  in os.listdir(pdf_res_path):
-                if ".md" in os.path.join(pdf_res_path, annotaion_name, "auto"):
+                if annotaion_name in os.path.join(pdf_res_path, annotaion_name, "auto"):
                     prefix = annotaion_name.split('_')[-2]
                     if not os.path.exists(os.join(pdf_dev_path, prefix)):
-                        os.makedirs(os.path.join(pdf_dev_path, prefix))
-                        shutil.copy(os.path.join(pdf_res_path, annotaion_name, "auto", annotaion_name + ".md"), os.join(pdf_dev_path, prefix, annotaion_name + ".md"))
+                        #os.makedirs(os.path.join(pdf_dev_path, prefix))
+                        shutil.copy(os.path.join(pdf_res_path, annotaion_name.strip(".md"), "auto", annotaion_name), os.join(pdf_dev_path, "ci", prefix, annotaion_name))
                    
 
 def calculate_score():