guess_suffix_or_lang.py 731 B

1234567891011121314151617181920212223242526
  1. from pathlib import Path
  2. from magika import Magika
  # Label that guess_language_by_text returns when detection yields "unknown".
  DEFAULT_LANG = "txt"

  # One detector instance, built at import time and shared by every helper
  # in this module.
  magika = Magika()
  def guess_language_by_text(code):
      """Return the content label detected for a piece of text.

      The text is encoded as UTF-8 and handed to the shared ``magika``
      detector. When the detector's label is ``"unknown"``, ``DEFAULT_LANG``
      is returned in its place.

      Args:
          code: The text to classify (must support ``.encode``).

      Returns:
          The detected label, or ``DEFAULT_LANG`` when detection is "unknown".
      """
      detection = magika.identify_bytes(code.encode("utf-8"))
      label = detection.prediction.output.label
      if label == "unknown":
          return DEFAULT_LANG
      return label
  def guess_suffix_by_bytes(file_bytes) -> str:
      """Return the label the shared ``magika`` detector reports for raw bytes.

      Args:
          file_bytes: The content to classify, passed to
              ``magika.identify_bytes`` unchanged.

      Returns:
          The detector's output label, returned as-is (no fallback is applied).
      """
      detection = magika.identify_bytes(file_bytes)
      return detection.prediction.output.label
  def guess_suffix_by_path(file_path) -> str:
      """Return the label the shared ``magika`` detector reports for a file.

      Args:
          file_path: Location of the file; anything that is not already a
              ``Path`` is wrapped in one before detection.

      Returns:
          The detector's output label, except that an ``"ai"`` label on a
          file whose extension is ``.pdf`` (case-insensitive) becomes
          ``"pdf"``.
      """
      path = file_path if isinstance(file_path, Path) else Path(file_path)
      label = magika.identify_path(path).prediction.output.label
      # NOTE(review): presumably guards against PDFs being labelled "ai" by
      # the detector -- confirm the motivation with the author.
      if label == "ai" and path.suffix.lower() == ".pdf":
          return "pdf"
      return label