language.py 1.1 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import os
  2. import unicodedata
  3. if not os.getenv("FTLANG_CACHE"):
  4. current_file_path = os.path.abspath(__file__)
  5. current_dir = os.path.dirname(current_file_path)
  6. root_dir = os.path.dirname(current_dir)
  7. ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
  8. os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
  9. # print(os.getenv("FTLANG_CACHE"))
  10. from fast_langdetect import detect_language
  11. def detect_lang(text: str) -> str:
  12. if len(text) == 0:
  13. return ""
  14. try:
  15. lang_upper = detect_language(text)
  16. except:
  17. html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
  18. lang_upper = detect_language(html_no_ctrl_chars)
  19. try:
  20. lang = lang_upper.lower()
  21. except:
  22. lang = ""
  23. return lang
  24. if __name__ == '__main__':
  25. print(os.getenv("FTLANG_CACHE"))
  26. print(detect_lang("This is a test."))
  27. print(detect_lang("<html>This is a test</html>"))
  28. print(detect_lang("这个是中文测试。"))
  29. print(detect_lang("<html>这个是中文测试。</html>"))