Răsfoiți Sursa

Merge remote-tracking branch 'origin/master'

赵小蒙 1 an în urmă
părinte
comite
f856695c2f
5 fișiere modificate, cu 27 adăugiri și 21 ștergeri
  1. 21 16
      demo/demo.py
  2. 3 3
      magic_pdf/libs/language.py
  3. 1 1
      magic_pdf/libs/version.py
  4. 1 0
      requirements.txt
  5. 1 1
      setup.py

+ 21 - 16
demo/demo.py

@@ -1,22 +1,27 @@
 import os
 import json
 
+from loguru import logger
+
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 
-current_script_dir = os.path.dirname(os.path.abspath(__file__))
-demo_name = "demo1"
-pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
-model_path = os.path.join(current_script_dir, f"{demo_name}.json")
-pdf_bytes = open(pdf_path, "rb").read()
-model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
-jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-local_image_dir = os.path.join(current_script_dir, 'images')
-image_dir = str(os.path.basename(local_image_dir))
-image_writer = DiskReaderWriter(local_image_dir)
-pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-pipe.pipe_classify()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
-    f.write(md_content)
+try:
+    current_script_dir = os.path.dirname(os.path.abspath(__file__))
+    demo_name = "demo1"
+    pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
+    model_path = os.path.join(current_script_dir, f"{demo_name}.json")
+    pdf_bytes = open(pdf_path, "rb").read()
+    model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
+    jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+    local_image_dir = os.path.join(current_script_dir, 'images')
+    image_dir = str(os.path.basename(local_image_dir))
+    image_writer = DiskReaderWriter(local_image_dir)
+    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+    pipe.pipe_classify()
+    pipe.pipe_parse()
+    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+    with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
+        f.write(md_content)
+except Exception as e:
+    logger.exception(e)

+ 3 - 3
magic_pdf/libs/language.py

@@ -1,15 +1,15 @@
 import unicodedata
-from fast_langdetect import detect_langs
+from fast_langdetect import detect_language
 
 
 def detect_lang(text: str) -> str:
     if len(text) == 0:
         return ""
     try:
-        lang_upper = detect_langs(text)
+        lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
-        lang_upper = detect_langs(html_no_ctrl_chars)
+        lang_upper = detect_language(html_no_ctrl_chars)
     try:
         lang = lang_upper.lower()
     except:

+ 1 - 1
magic_pdf/libs/version.py

@@ -1 +1 @@
-__version__ = "0.5.12"
+__version__ = "0.5.13"

+ 1 - 0
requirements.txt

@@ -8,4 +8,5 @@ fast-langdetect>=0.1.1
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 pdfminer.six>=20231228
+numpy<2.0.0 #2.0版本与fasttext不兼容
 # requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员

+ 1 - 1
setup.py

@@ -35,7 +35,7 @@ if __name__ == '__main__':
         description="A practical tool for converting PDF to Markdown",  # 简短描述
         long_description=long_description,  # 详细描述
         long_description_content_type="text/markdown",  # 如果README是Markdown格式
-        url="https://github.com/magicpdf/Magic-PDF",
+        url="https://github.com/opendatalab/MinerU",
         python_requires=">=3.9",  # 项目依赖的 Python 版本
         entry_points={
             "console_scripts": [