language.py 812 B

12345678910111213141516171819202122232425262728293031
  1. import regex
  2. import unicodedata
  3. from fast_langdetect import detect_langs
  4. RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
  5. def remove_bad_chars(text):
  6. return RE_BAD_CHARS.sub("", text)
  7. def detect_lang(text: str) -> str:
  8. if len(text) == 0:
  9. return ""
  10. try:
  11. lang_upper = detect_langs(text)
  12. except:
  13. html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
  14. lang_upper = detect_langs(html_no_ctrl_chars)
  15. try:
  16. lang = lang_upper.lower()
  17. except:
  18. lang = ""
  19. return lang
  20. if __name__ == '__main__':
  21. print(detect_lang("This is a test."))
  22. print(detect_lang("<html>This is a test</html>"))
  23. print(detect_lang("这个是中文测试。"))
  24. print(detect_lang("<html>这个是中文测试。</html>"))