# guess_suffix_or_lang.py (856 B)

  from pathlib import Path

  from magika import Magika

  # Label substituted when Magika reports "unknown" for a text snippet.
  DEFAULT_LANG = "txt"

  # Module-level Magika instance used by the detection functions below.
  magika = Magika()
  def guess_language_by_text(code):
      """Guess the content-type label of a text snippet using Magika.

      The text is encoded as UTF-8 and passed to ``magika.identify_bytes``.

      Args:
          code: Text to classify; it must provide ``encode`` (e.g. a ``str``).

      Returns:
          The label Magika reports, or ``DEFAULT_LANG`` when that label
          equals ``"unknown"``.
      """
      result = magika.identify_bytes(code.encode(encoding="utf-8"))
      label = result.prediction.output.label
      if label == "unknown":
          return DEFAULT_LANG
      return label
  def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
      """Guess a file-type label from raw bytes using Magika.

      Args:
          file_bytes: Content handed to ``magika.identify_bytes``.
          file_path: Optional path of the file the bytes came from. It is only
              consulted for its extension, and only when Magika answers ``"ai"``.

      Returns:
          The label Magika reports, except that ``"ai"`` becomes ``"pdf"`` when
          ``file_path`` is given and ends in ``.pdf`` (case-insensitive).
      """
      detected = magika.identify_bytes(file_bytes).prediction.output.label
      # NOTE(review): presumably guards against PDFs being misdetected as
      # Adobe Illustrator ("ai") files - confirm the motivation.
      ai_with_path = bool(file_path) and detected == "ai"
      if ai_with_path and Path(file_path).suffix.lower() == ".pdf":
          return "pdf"
      return detected
  def guess_suffix_by_path(file_path) -> str:
      """Guess a file-type label for a file on disk using Magika.

      Args:
          file_path: Location of the file; anything that is not already a
              ``Path`` is wrapped in one before use.

      Returns:
          The label reported by ``magika.identify_path``, except that ``"ai"``
          becomes ``"pdf"`` when the path ends in ``.pdf`` (case-insensitive).
      """
      path = file_path if isinstance(file_path, Path) else Path(file_path)
      detected = magika.identify_path(path).prediction.output.label
      # NOTE(review): presumably guards against PDFs being misdetected as
      # Adobe Illustrator ("ai") files - confirm the motivation.
      if detected == "ai" and path.suffix.lower() == ".pdf":
          return "pdf"
      return detected