detect_language_from_model.py 816 B

123456789101112131415161718192021
  1. from collections import Counter
  2. from magic_pdf.libs.language import detect_lang
  3. def get_language_from_model(model_list: list):
  4. language_lst = []
  5. for ocr_page_info in model_list:
  6. page_text = ""
  7. layout_dets = ocr_page_info["layout_dets"]
  8. for layout_det in layout_dets:
  9. category_id = layout_det["category_id"]
  10. allow_category_id_list = [15]
  11. if category_id in allow_category_id_list:
  12. page_text += layout_det["text"]
  13. page_language = detect_lang(page_text)
  14. language_lst.append(page_language)
  15. # 统计text_language_list中每种语言的个数
  16. count_dict = Counter(language_lst)
  17. # 输出text_language_list中出现的次数最多的语言
  18. language = max(count_dict, key=count_dict.get)
  19. return language