language.py 969 B

123456789101112131415161718192021222324252627282930313233343536
  1. import pycld2 as cld2
  2. import regex
  3. import unicodedata
  4. RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
  5. def remove_bad_chars(text):
  6. return RE_BAD_CHARS.sub("", text)
  7. def detect_lang(text: str) -> str:
  8. if len(text) == 0:
  9. return ""
  10. try:
  11. _, _, details = cld2.detect(text)
  12. except:
  13. # cld2 doesn't like control characters
  14. # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
  15. html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
  16. _, _, details = cld2.detect(html_no_ctrl_chars)
  17. lang = ""
  18. try:
  19. lang = details[0][1].lower()
  20. except:
  21. lang = ""
  22. return lang
  23. if __name__ == '__main__':
  24. print(detect_lang("This is a test."))
  25. print(detect_lang("<html>This is a test</html>"))
  26. print(detect_lang("这个是中文测试。"))
  27. print(detect_lang("<html>这个是中文测试。</html>"))