# solve_line_alien.py
  1. def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict: # text_block -> json中的preproc_block
  2. """解决行内文本间距过大问题"""
  3. for i in range(len(pdf_info_dict)):
  4. text_blocks = pdf_info_dict[f'page_{i}']['preproc_blocks']
  5. for block in text_blocks:
  6. x_pre_1, y_pre_1, x_pre_2, y_pre_2 = 0, 0, 0, 0
  7. for line in block['lines']:
  8. x_cur_1, y_cur_1, x_cur_2, y_cur_2 = line['bbox']
  9. # line_box = [x1, y1, x2, y2]
  10. if int(y_cur_1) == int(y_pre_1) and int(y_cur_2) == int(y_pre_2):
  11. # if len(line['spans']) == 1:
  12. line['spans'][0]['text'] = ' ' + line['spans'][0]['text']
  13. x_pre_1, y_pre_1, x_pre_2, y_pre_2 = line['bbox']
  14. return pdf_info_dict