# construct_page_dict.py
  1. def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
  2. interline_eq_info, raw_pymu_blocks,
  3. removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
  4. layout_tree,
  5. page_w, page_h, footnote_bboxes_tmp):
  6. """
  7. """
  8. return_dict = {}
  9. return_dict['para_blocks'] = {}
  10. return_dict['preproc_blocks'] = text_blocks_preproc
  11. return_dict['images'] = image_info
  12. return_dict['tables'] = table_info
  13. return_dict['interline_equations'] = interline_eq_info
  14. return_dict['inline_equations'] = inline_eq_info
  15. return_dict['layout_bboxes'] = layout_bboxes
  16. return_dict['pymu_raw_blocks'] = raw_pymu_blocks
  17. return_dict['global_statistic'] = {}
  18. return_dict['droped_text_block'] = removed_text_blocks
  19. return_dict['droped_image_block'] = removed_image_blocks
  20. return_dict['droped_table_block'] = []
  21. return_dict['image_backup'] = images_backup
  22. return_dict['table_backup'] = []
  23. return_dict['page_idx'] = page_id
  24. return_dict['page_size'] = [page_w, page_h]
  25. return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
  26. return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
  27. return return_dict
  28. def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
  29. images, tables, interline_equations, inline_equations,
  30. dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
  31. need_remove_spans_bboxes_dict):
  32. return_dict = {
  33. 'preproc_blocks': blocks,
  34. 'layout_bboxes': layout_bboxes,
  35. 'page_idx': page_id,
  36. 'page_size': [page_w, page_h],
  37. '_layout_tree': layout_tree,
  38. 'images': images,
  39. 'tables': tables,
  40. 'interline_equations': interline_equations,
  41. 'inline_equations': inline_equations,
  42. 'droped_text_block': dropped_text_block,
  43. 'droped_image_block': dropped_image_block,
  44. 'droped_table_block': dropped_table_block,
  45. 'dropped_equation_block': dropped_equation_block,
  46. 'droped_bboxes': need_remove_spans_bboxes_dict,
  47. }
  48. return return_dict
  49. def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
  50. images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
  51. return_dict = {
  52. 'preproc_blocks': blocks,
  53. 'layout_bboxes': layout_bboxes,
  54. 'page_idx': page_id,
  55. 'page_size': [page_w, page_h],
  56. '_layout_tree': layout_tree,
  57. 'images': images,
  58. 'tables': tables,
  59. 'interline_equations': interline_equations,
  60. 'discarded_blocks': discarded_blocks,
  61. 'need_drop': need_drop,
  62. 'drop_reason': drop_reason,
  63. }
  64. return return_dict