md.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635
  1. from __future__ import annotations
  2. from functools import lru_cache
  3. from logging import getLogger
  4. from .constant import (
  5. COMMON_SAFE_ASCII_CHARACTERS,
  6. TRACE,
  7. UNICODE_SECONDARY_RANGE_KEYWORD,
  8. )
  9. from .utils import (
  10. is_accentuated,
  11. is_arabic,
  12. is_arabic_isolated_form,
  13. is_case_variable,
  14. is_cjk,
  15. is_emoticon,
  16. is_hangul,
  17. is_hiragana,
  18. is_katakana,
  19. is_latin,
  20. is_punctuation,
  21. is_separator,
  22. is_symbol,
  23. is_thai,
  24. is_unprintable,
  25. remove_accent,
  26. unicode_range,
  27. is_cjk_uncommon,
  28. )
  29. class MessDetectorPlugin:
  30. """
  31. Base abstract class used for mess detection plugins.
  32. All detectors MUST extend and implement given methods.
  33. """
  34. def eligible(self, character: str) -> bool:
  35. """
  36. Determine if given character should be fed in.
  37. """
  38. raise NotImplementedError # pragma: nocover
  39. def feed(self, character: str) -> None:
  40. """
  41. The main routine to be executed upon character.
  42. Insert the logic in witch the text would be considered chaotic.
  43. """
  44. raise NotImplementedError # pragma: nocover
  45. def reset(self) -> None: # pragma: no cover
  46. """
  47. Permit to reset the plugin to the initial state.
  48. """
  49. raise NotImplementedError
  50. @property
  51. def ratio(self) -> float:
  52. """
  53. Compute the chaos ratio based on what your feed() has seen.
  54. Must NOT be lower than 0.; No restriction gt 0.
  55. """
  56. raise NotImplementedError # pragma: nocover
  57. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  58. def __init__(self) -> None:
  59. self._punctuation_count: int = 0
  60. self._symbol_count: int = 0
  61. self._character_count: int = 0
  62. self._last_printable_char: str | None = None
  63. self._frenzy_symbol_in_word: bool = False
  64. def eligible(self, character: str) -> bool:
  65. return character.isprintable()
  66. def feed(self, character: str) -> None:
  67. self._character_count += 1
  68. if (
  69. character != self._last_printable_char
  70. and character not in COMMON_SAFE_ASCII_CHARACTERS
  71. ):
  72. if is_punctuation(character):
  73. self._punctuation_count += 1
  74. elif (
  75. character.isdigit() is False
  76. and is_symbol(character)
  77. and is_emoticon(character) is False
  78. ):
  79. self._symbol_count += 2
  80. self._last_printable_char = character
  81. def reset(self) -> None: # Abstract
  82. self._punctuation_count = 0
  83. self._character_count = 0
  84. self._symbol_count = 0
  85. @property
  86. def ratio(self) -> float:
  87. if self._character_count == 0:
  88. return 0.0
  89. ratio_of_punctuation: float = (
  90. self._punctuation_count + self._symbol_count
  91. ) / self._character_count
  92. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  93. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  94. def __init__(self) -> None:
  95. self._character_count: int = 0
  96. self._accentuated_count: int = 0
  97. def eligible(self, character: str) -> bool:
  98. return character.isalpha()
  99. def feed(self, character: str) -> None:
  100. self._character_count += 1
  101. if is_accentuated(character):
  102. self._accentuated_count += 1
  103. def reset(self) -> None: # Abstract
  104. self._character_count = 0
  105. self._accentuated_count = 0
  106. @property
  107. def ratio(self) -> float:
  108. if self._character_count < 8:
  109. return 0.0
  110. ratio_of_accentuation: float = self._accentuated_count / self._character_count
  111. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  112. class UnprintablePlugin(MessDetectorPlugin):
  113. def __init__(self) -> None:
  114. self._unprintable_count: int = 0
  115. self._character_count: int = 0
  116. def eligible(self, character: str) -> bool:
  117. return True
  118. def feed(self, character: str) -> None:
  119. if is_unprintable(character):
  120. self._unprintable_count += 1
  121. self._character_count += 1
  122. def reset(self) -> None: # Abstract
  123. self._unprintable_count = 0
  124. @property
  125. def ratio(self) -> float:
  126. if self._character_count == 0:
  127. return 0.0
  128. return (self._unprintable_count * 8) / self._character_count
  129. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  130. def __init__(self) -> None:
  131. self._successive_count: int = 0
  132. self._character_count: int = 0
  133. self._last_latin_character: str | None = None
  134. def eligible(self, character: str) -> bool:
  135. return character.isalpha() and is_latin(character)
  136. def feed(self, character: str) -> None:
  137. self._character_count += 1
  138. if (
  139. self._last_latin_character is not None
  140. and is_accentuated(character)
  141. and is_accentuated(self._last_latin_character)
  142. ):
  143. if character.isupper() and self._last_latin_character.isupper():
  144. self._successive_count += 1
  145. # Worse if its the same char duplicated with different accent.
  146. if remove_accent(character) == remove_accent(self._last_latin_character):
  147. self._successive_count += 1
  148. self._last_latin_character = character
  149. def reset(self) -> None: # Abstract
  150. self._successive_count = 0
  151. self._character_count = 0
  152. self._last_latin_character = None
  153. @property
  154. def ratio(self) -> float:
  155. if self._character_count == 0:
  156. return 0.0
  157. return (self._successive_count * 2) / self._character_count
  158. class SuspiciousRange(MessDetectorPlugin):
  159. def __init__(self) -> None:
  160. self._suspicious_successive_range_count: int = 0
  161. self._character_count: int = 0
  162. self._last_printable_seen: str | None = None
  163. def eligible(self, character: str) -> bool:
  164. return character.isprintable()
  165. def feed(self, character: str) -> None:
  166. self._character_count += 1
  167. if (
  168. character.isspace()
  169. or is_punctuation(character)
  170. or character in COMMON_SAFE_ASCII_CHARACTERS
  171. ):
  172. self._last_printable_seen = None
  173. return
  174. if self._last_printable_seen is None:
  175. self._last_printable_seen = character
  176. return
  177. unicode_range_a: str | None = unicode_range(self._last_printable_seen)
  178. unicode_range_b: str | None = unicode_range(character)
  179. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  180. self._suspicious_successive_range_count += 1
  181. self._last_printable_seen = character
  182. def reset(self) -> None: # Abstract
  183. self._character_count = 0
  184. self._suspicious_successive_range_count = 0
  185. self._last_printable_seen = None
  186. @property
  187. def ratio(self) -> float:
  188. if self._character_count <= 13:
  189. return 0.0
  190. ratio_of_suspicious_range_usage: float = (
  191. self._suspicious_successive_range_count * 2
  192. ) / self._character_count
  193. return ratio_of_suspicious_range_usage
  194. class SuperWeirdWordPlugin(MessDetectorPlugin):
  195. def __init__(self) -> None:
  196. self._word_count: int = 0
  197. self._bad_word_count: int = 0
  198. self._foreign_long_count: int = 0
  199. self._is_current_word_bad: bool = False
  200. self._foreign_long_watch: bool = False
  201. self._character_count: int = 0
  202. self._bad_character_count: int = 0
  203. self._buffer: str = ""
  204. self._buffer_accent_count: int = 0
  205. self._buffer_glyph_count: int = 0
  206. def eligible(self, character: str) -> bool:
  207. return True
  208. def feed(self, character: str) -> None:
  209. if character.isalpha():
  210. self._buffer += character
  211. if is_accentuated(character):
  212. self._buffer_accent_count += 1
  213. if (
  214. self._foreign_long_watch is False
  215. and (is_latin(character) is False or is_accentuated(character))
  216. and is_cjk(character) is False
  217. and is_hangul(character) is False
  218. and is_katakana(character) is False
  219. and is_hiragana(character) is False
  220. and is_thai(character) is False
  221. ):
  222. self._foreign_long_watch = True
  223. if (
  224. is_cjk(character)
  225. or is_hangul(character)
  226. or is_katakana(character)
  227. or is_hiragana(character)
  228. or is_thai(character)
  229. ):
  230. self._buffer_glyph_count += 1
  231. return
  232. if not self._buffer:
  233. return
  234. if (
  235. character.isspace() or is_punctuation(character) or is_separator(character)
  236. ) and self._buffer:
  237. self._word_count += 1
  238. buffer_length: int = len(self._buffer)
  239. self._character_count += buffer_length
  240. if buffer_length >= 4:
  241. if self._buffer_accent_count / buffer_length >= 0.5:
  242. self._is_current_word_bad = True
  243. # Word/Buffer ending with an upper case accentuated letter are so rare,
  244. # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
  245. elif (
  246. is_accentuated(self._buffer[-1])
  247. and self._buffer[-1].isupper()
  248. and all(_.isupper() for _ in self._buffer) is False
  249. ):
  250. self._foreign_long_count += 1
  251. self._is_current_word_bad = True
  252. elif self._buffer_glyph_count == 1:
  253. self._is_current_word_bad = True
  254. self._foreign_long_count += 1
  255. if buffer_length >= 24 and self._foreign_long_watch:
  256. camel_case_dst = [
  257. i
  258. for c, i in zip(self._buffer, range(0, buffer_length))
  259. if c.isupper()
  260. ]
  261. probable_camel_cased: bool = False
  262. if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
  263. probable_camel_cased = True
  264. if not probable_camel_cased:
  265. self._foreign_long_count += 1
  266. self._is_current_word_bad = True
  267. if self._is_current_word_bad:
  268. self._bad_word_count += 1
  269. self._bad_character_count += len(self._buffer)
  270. self._is_current_word_bad = False
  271. self._foreign_long_watch = False
  272. self._buffer = ""
  273. self._buffer_accent_count = 0
  274. self._buffer_glyph_count = 0
  275. elif (
  276. character not in {"<", ">", "-", "=", "~", "|", "_"}
  277. and character.isdigit() is False
  278. and is_symbol(character)
  279. ):
  280. self._is_current_word_bad = True
  281. self._buffer += character
  282. def reset(self) -> None: # Abstract
  283. self._buffer = ""
  284. self._is_current_word_bad = False
  285. self._foreign_long_watch = False
  286. self._bad_word_count = 0
  287. self._word_count = 0
  288. self._character_count = 0
  289. self._bad_character_count = 0
  290. self._foreign_long_count = 0
  291. @property
  292. def ratio(self) -> float:
  293. if self._word_count <= 10 and self._foreign_long_count == 0:
  294. return 0.0
  295. return self._bad_character_count / self._character_count
  296. class CjkUncommonPlugin(MessDetectorPlugin):
  297. """
  298. Detect messy CJK text that probably means nothing.
  299. """
  300. def __init__(self) -> None:
  301. self._character_count: int = 0
  302. self._uncommon_count: int = 0
  303. def eligible(self, character: str) -> bool:
  304. return is_cjk(character)
  305. def feed(self, character: str) -> None:
  306. self._character_count += 1
  307. if is_cjk_uncommon(character):
  308. self._uncommon_count += 1
  309. return
  310. def reset(self) -> None: # Abstract
  311. self._character_count = 0
  312. self._uncommon_count = 0
  313. @property
  314. def ratio(self) -> float:
  315. if self._character_count < 8:
  316. return 0.0
  317. uncommon_form_usage: float = self._uncommon_count / self._character_count
  318. # we can be pretty sure it's garbage when uncommon characters are widely
  319. # used. otherwise it could just be traditional chinese for example.
  320. return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
  321. class ArchaicUpperLowerPlugin(MessDetectorPlugin):
  322. def __init__(self) -> None:
  323. self._buf: bool = False
  324. self._character_count_since_last_sep: int = 0
  325. self._successive_upper_lower_count: int = 0
  326. self._successive_upper_lower_count_final: int = 0
  327. self._character_count: int = 0
  328. self._last_alpha_seen: str | None = None
  329. self._current_ascii_only: bool = True
  330. def eligible(self, character: str) -> bool:
  331. return True
  332. def feed(self, character: str) -> None:
  333. is_concerned = character.isalpha() and is_case_variable(character)
  334. chunk_sep = is_concerned is False
  335. if chunk_sep and self._character_count_since_last_sep > 0:
  336. if (
  337. self._character_count_since_last_sep <= 64
  338. and character.isdigit() is False
  339. and self._current_ascii_only is False
  340. ):
  341. self._successive_upper_lower_count_final += (
  342. self._successive_upper_lower_count
  343. )
  344. self._successive_upper_lower_count = 0
  345. self._character_count_since_last_sep = 0
  346. self._last_alpha_seen = None
  347. self._buf = False
  348. self._character_count += 1
  349. self._current_ascii_only = True
  350. return
  351. if self._current_ascii_only is True and character.isascii() is False:
  352. self._current_ascii_only = False
  353. if self._last_alpha_seen is not None:
  354. if (character.isupper() and self._last_alpha_seen.islower()) or (
  355. character.islower() and self._last_alpha_seen.isupper()
  356. ):
  357. if self._buf is True:
  358. self._successive_upper_lower_count += 2
  359. self._buf = False
  360. else:
  361. self._buf = True
  362. else:
  363. self._buf = False
  364. self._character_count += 1
  365. self._character_count_since_last_sep += 1
  366. self._last_alpha_seen = character
  367. def reset(self) -> None: # Abstract
  368. self._character_count = 0
  369. self._character_count_since_last_sep = 0
  370. self._successive_upper_lower_count = 0
  371. self._successive_upper_lower_count_final = 0
  372. self._last_alpha_seen = None
  373. self._buf = False
  374. self._current_ascii_only = True
  375. @property
  376. def ratio(self) -> float:
  377. if self._character_count == 0:
  378. return 0.0
  379. return self._successive_upper_lower_count_final / self._character_count
  380. class ArabicIsolatedFormPlugin(MessDetectorPlugin):
  381. def __init__(self) -> None:
  382. self._character_count: int = 0
  383. self._isolated_form_count: int = 0
  384. def reset(self) -> None: # Abstract
  385. self._character_count = 0
  386. self._isolated_form_count = 0
  387. def eligible(self, character: str) -> bool:
  388. return is_arabic(character)
  389. def feed(self, character: str) -> None:
  390. self._character_count += 1
  391. if is_arabic_isolated_form(character):
  392. self._isolated_form_count += 1
  393. @property
  394. def ratio(self) -> float:
  395. if self._character_count < 8:
  396. return 0.0
  397. isolated_form_usage: float = self._isolated_form_count / self._character_count
  398. return isolated_form_usage
  399. @lru_cache(maxsize=1024)
  400. def is_suspiciously_successive_range(
  401. unicode_range_a: str | None, unicode_range_b: str | None
  402. ) -> bool:
  403. """
  404. Determine if two Unicode range seen next to each other can be considered as suspicious.
  405. """
  406. if unicode_range_a is None or unicode_range_b is None:
  407. return True
  408. if unicode_range_a == unicode_range_b:
  409. return False
  410. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  411. return False
  412. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  413. return False
  414. # Latin characters can be accompanied with a combining diacritical mark
  415. # eg. Vietnamese.
  416. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  417. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  418. ):
  419. return False
  420. keywords_range_a, keywords_range_b = (
  421. unicode_range_a.split(" "),
  422. unicode_range_b.split(" "),
  423. )
  424. for el in keywords_range_a:
  425. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  426. continue
  427. if el in keywords_range_b:
  428. return False
  429. # Japanese Exception
  430. range_a_jp_chars, range_b_jp_chars = (
  431. unicode_range_a
  432. in (
  433. "Hiragana",
  434. "Katakana",
  435. ),
  436. unicode_range_b in ("Hiragana", "Katakana"),
  437. )
  438. if (range_a_jp_chars or range_b_jp_chars) and (
  439. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  440. ):
  441. return False
  442. if range_a_jp_chars and range_b_jp_chars:
  443. return False
  444. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  445. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  446. return False
  447. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  448. return False
  449. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  450. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  451. unicode_range_a in ["Katakana", "Hiragana"]
  452. and unicode_range_b in ["Katakana", "Hiragana"]
  453. ):
  454. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  455. return False
  456. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  457. return False
  458. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  459. return False
  460. return True
  461. @lru_cache(maxsize=2048)
  462. def mess_ratio(
  463. decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
  464. ) -> float:
  465. """
  466. Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
  467. """
  468. detectors: list[MessDetectorPlugin] = [
  469. md_class() for md_class in MessDetectorPlugin.__subclasses__()
  470. ]
  471. length: int = len(decoded_sequence) + 1
  472. mean_mess_ratio: float = 0.0
  473. if length < 512:
  474. intermediary_mean_mess_ratio_calc: int = 32
  475. elif length <= 1024:
  476. intermediary_mean_mess_ratio_calc = 64
  477. else:
  478. intermediary_mean_mess_ratio_calc = 128
  479. for character, index in zip(decoded_sequence + "\n", range(length)):
  480. for detector in detectors:
  481. if detector.eligible(character):
  482. detector.feed(character)
  483. if (
  484. index > 0 and index % intermediary_mean_mess_ratio_calc == 0
  485. ) or index == length - 1:
  486. mean_mess_ratio = sum(dt.ratio for dt in detectors)
  487. if mean_mess_ratio >= maximum_threshold:
  488. break
  489. if debug:
  490. logger = getLogger("charset_normalizer")
  491. logger.log(
  492. TRACE,
  493. "Mess-detector extended-analysis start. "
  494. f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
  495. f"maximum_threshold={maximum_threshold}",
  496. )
  497. if len(decoded_sequence) > 16:
  498. logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
  499. logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
  500. for dt in detectors:
  501. logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
  502. return round(mean_mess_ratio, 3)