legacy.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. from __future__ import annotations
  2. from typing import TYPE_CHECKING, Any
  3. from warnings import warn
  4. from .api import from_bytes
  5. from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
# TODO: remove this check when dropping Python 3.7 support
# Everything under this guard is only seen by static type checkers:
# TYPE_CHECKING is False at runtime, so neither the typing_extensions import
# nor the class definition below is executed when the module is imported.
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    # Shape of the dictionary returned by detect().
    class ResultDict(TypedDict):
        # Detected encoding name; detect() stores None when no match was found.
        encoding: str | None
        # Detected language; detect() stores "" when there is no match or the
        # match reports "Unknown".
        language: str
        # Computed by detect() as 1.0 minus the match's chaos value (possibly
        # lowered for small samples); None when no match was found.
        confidence: float | None
  13. def detect(
  14. byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
  15. ) -> ResultDict:
  16. """
  17. chardet legacy method
  18. Detect the encoding of the given byte string. It should be mostly backward-compatible.
  19. Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
  20. This function is deprecated and should be used to migrate your project easily, consult the documentation for
  21. further information. Not planned for removal.
  22. :param byte_str: The byte sequence to examine.
  23. :param should_rename_legacy: Should we rename legacy encodings
  24. to their more modern equivalents?
  25. """
  26. if len(kwargs):
  27. warn(
  28. f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
  29. )
  30. if not isinstance(byte_str, (bytearray, bytes)):
  31. raise TypeError( # pragma: nocover
  32. f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
  33. )
  34. if isinstance(byte_str, bytearray):
  35. byte_str = bytes(byte_str)
  36. r = from_bytes(byte_str).best()
  37. encoding = r.encoding if r is not None else None
  38. language = r.language if r is not None and r.language != "Unknown" else ""
  39. confidence = 1.0 - r.chaos if r is not None else None
  40. # automatically lower confidence
  41. # on small bytes samples.
  42. # https://github.com/jawah/charset_normalizer/issues/391
  43. if (
  44. confidence is not None
  45. and confidence >= 0.9
  46. and encoding
  47. not in {
  48. "utf_8",
  49. "ascii",
  50. }
  51. and r.bom is False # type: ignore[union-attr]
  52. and len(byte_str) < TOO_SMALL_SEQUENCE
  53. ):
  54. confidence -= 0.2
  55. # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
  56. # but chardet does return 'utf-8-sig' and it is a valid codec name.
  57. if r is not None and encoding == "utf_8" and r.bom:
  58. encoding += "_sig"
  59. if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
  60. encoding = CHARDET_CORRESPONDENCE[encoding]
  61. return {
  62. "encoding": encoding,
  63. "language": language,
  64. "confidence": confidence,
  65. }