# main_text_font.py

import collections
  2. def get_main_text_font(pdf_docs):
  3. font_names = collections.Counter()
  4. for page in pdf_docs:
  5. blocks = page.get_text('dict')['blocks']
  6. if blocks is not None:
  7. for block in blocks:
  8. lines = block.get('lines')
  9. if lines is not None:
  10. for line in lines:
  11. span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
  12. 'font' in span and len(span['text']) > 0]
  13. if span_font:
  14. # main_text_font应该用基于字数最多的字体而不是span级别的统计
  15. # font_names.append(font_name for font_name in span_font)
  16. # block_fonts.append(font_name for font_name in span_font)
  17. for font, count in span_font:
  18. font_names[font] += count
  19. main_text_font = font_names.most_common(1)[0][0]
  20. return main_text_font