1 éve · ce0d99057a
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -1,7 +1,6 @@
 
				-import pycld2 as cld2
			
 
				 import regex
			
 
				 import unicodedata
			
 
				-
			
 
				+from fast_langdetect import detect_langs
			
 
				 
			
 
				 RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
			
 
				 
			
@@ -13,17 +12,13 @@ def remove_bad_chars(text):
 
				 def detect_lang(text: str) -> str:
			
 
				     if len(text) == 0:
			
 
				         return ""
			
 
				-
			
 
				     try:
			
 
				-        _, _, details = cld2.detect(text)
			
 
				+        lang_upper = detect_langs(text)
			
 
				     except:
			
 
				-        # cld2 doesn't like control characters
			
 
				-        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
			
 
				-        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
			
 
				-        _, _, details = cld2.detect(html_no_ctrl_chars)
			
 
				-    lang = ""
			
 
				+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
			
 
				+        lang_upper = detect_langs(html_no_ctrl_chars)
			
 
				     try:
			
 
				-        lang = details[0][1].lower()
			
 
				+        lang = lang_upper.lower()
			
 
				     except:
			
 
				         lang = ""
			
 
				     return lang
			
@@ -33,4 +28,4 @@ if __name__ == '__main__':
 
				     print(detect_lang("This is a test."))
			
 
				     print(detect_lang("<html>This is a test</html>"))
			
 
				     print(detect_lang("这个是中文测试。"))
			
 
				-    print(detect_lang("<html>这个是中文测试。</html>"))
			
 
				+    print(detect_lang("<html>这个是中文测试。</html>"))
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,13 +7,12 @@ loguru>=0.6.0
 
				 matplotlib>=3.8.3
			
 
				 numpy>=1.21.6
			
 
				 pandas>=1.3.5
			
 
				-pycld2>=0.41
			
 
				+fast-langdetect>=0.1.1
			
 
				 regex>=2023.12.25
			
 
				 termcolor>=2.4.0
			
 
				 wordninja>=2.0.0
			
 
				 scikit-learn>=1.0.2
			
 
				 nltk==3.8.1
			
 
				 s3pathlib>=2.1.1
			
 
				-pytest
			
 
				 paddlepaddle
			
 
				 paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl