language.py 692 B

123456789101112131415161718192021222324
  1. import unicodedata
  2. from fast_langdetect import detect_language
  3. def detect_lang(text: str) -> str:
  4. if len(text) == 0:
  5. return ""
  6. try:
  7. lang_upper = detect_language(text)
  8. except:
  9. html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
  10. lang_upper = detect_language(html_no_ctrl_chars)
  11. try:
  12. lang = lang_upper.lower()
  13. except:
  14. lang = ""
  15. return lang
  16. if __name__ == '__main__':
  17. print(detect_lang("This is a test."))
  18. print(detect_lang("<html>This is a test</html>"))
  19. print(detect_lang("这个是中文测试。"))
  20. print(detect_lang("<html>这个是中文测试。</html>"))