language.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import os
  2. import unicodedata
  3. if not os.getenv("FTLANG_CACHE"):
  4. current_file_path = os.path.abspath(__file__)
  5. current_dir = os.path.dirname(current_file_path)
  6. root_dir = os.path.dirname(current_dir)
  7. ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
  8. os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
  9. # print(os.getenv("FTLANG_CACHE"))
  10. from fast_langdetect import detect_language
  11. def remove_invalid_surrogates(text):
  12. # 移除无效的 UTF-16 代理对
  13. return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))
  14. def detect_lang(text: str) -> str:
  15. if len(text) == 0:
  16. return ""
  17. text = text.replace("\n", "")
  18. text = remove_invalid_surrogates(text)
  19. # print(text)
  20. try:
  21. lang_upper = detect_language(text)
  22. except:
  23. html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
  24. lang_upper = detect_language(html_no_ctrl_chars)
  25. try:
  26. lang = lang_upper.lower()
  27. except:
  28. lang = ""
  29. return lang
  30. if __name__ == '__main__':
  31. print(os.getenv("FTLANG_CACHE"))
  32. print(detect_lang("This is a test."))
  33. print(detect_lang("<html>This is a test</html>"))
  34. print(detect_lang("这个是中文测试。"))
  35. print(detect_lang("<html>这个是中文测试。</html>"))
  36. print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试"))