Forráskód Böngészése

Use fast_langdetect to replace cld2

赵小蒙 1 éve
szülő
commit
ce0d99057a
2 módosított fájl, 7 hozzáadás és 13 törlés
  1. + 6 - 11
      magic_pdf/libs/language.py
  2. + 1 - 2
      requirements.txt

+ 6 - 11
magic_pdf/libs/language.py

@@ -1,7 +1,6 @@
-import pycld2 as cld2
 import regex
 import unicodedata
-
+from fast_langdetect import detect_langs
 
 RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
 
@@ -13,17 +12,13 @@ def remove_bad_chars(text):
 def detect_lang(text: str) -> str:
     if len(text) == 0:
         return ""
-
     try:
-        _, _, details = cld2.detect(text)
+        lang_upper = detect_langs(text)
     except:
-        # cld2 doesn't like control characters
-        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
-        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
-        _, _, details = cld2.detect(html_no_ctrl_chars)
-    lang = ""
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+        lang_upper = detect_langs(html_no_ctrl_chars)
     try:
-        lang = details[0][1].lower()
+        lang = lang_upper.lower()
     except:
         lang = ""
     return lang
@@ -33,4 +28,4 @@ if __name__ == '__main__':
     print(detect_lang("This is a test."))
     print(detect_lang("<html>This is a test</html>"))
     print(detect_lang("这个是中文测试。"))
-    print(detect_lang("<html>这个是中文测试。</html>"))
+    print(detect_lang("<html>这个是中文测试。</html>"))

+ 1 - 2
requirements.txt

@@ -7,13 +7,12 @@ loguru>=0.6.0
 matplotlib>=3.8.3
 numpy>=1.21.6
 pandas>=1.3.5
-pycld2>=0.41
+fast-langdetect>=0.1.1
 regex>=2023.12.25
 termcolor>=2.4.0
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-pytest
 paddlepaddle
 paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl