| 123456789101112131415161718192021222324252627282930313233343536 |
- import pycld2 as cld2
- import regex
- import unicodedata
# Matches any code point in Unicode general category Cc (control) or
# Cs (surrogate); compiled once at import time and reused by every call.
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")


def remove_bad_chars(text):
    """Return *text* with every control (Cc) and surrogate (Cs) code point removed.

    All other characters are kept in their original order.
    """
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned
def detect_lang(text: str) -> str:
    """Return the lowercase code of the top language cld2 detects in *text*.

    Detection is first attempted on the text as given. If ``cld2.detect``
    raises, the text is stripped of every code point whose Unicode general
    category starts with ``C`` and detection is retried once.

    Args:
        text: The text to classify.

    Returns:
        ``details[0][1]`` of the cld2 result, lowercased (the language code
        of the first guess, per pycld2's documented result layout), or ``""``
        when *text* is empty or that field cannot be read.

    Raises:
        Any exception raised by ``cld2.detect`` on the retry propagates to
        the caller.
    """
    if not text:
        return ""

    try:
        _, _, details = cld2.detect(text)
    except Exception:
        # cld2 doesn't like control characters
        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
        # `except Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit are not swallowed by the retry.
        sanitized = "".join(
            ch for ch in text if unicodedata.category(ch)[0] != "C"
        )
        _, _, details = cld2.detect(sanitized)

    try:
        return details[0][1].lower()
    except (IndexError, TypeError, AttributeError):
        # Result did not have the expected shape; report "unknown".
        return ""
if __name__ == '__main__':
    # Manual smoke check: print the detected language for plain and
    # HTML-wrapped samples in English and Chinese, one result per line.
    samples = (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    )
    for sample in samples:
        print(detect_lang(sample))
|