pdf_parse_by_txt_v2.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. from magic_pdf.pdf_parse_union_core import pdf_parse_union
  2. def parse_pdf_by_txt(
  3. pdf_bytes,
  4. model_list,
  5. imageWriter,
  6. start_page_id=0,
  7. end_page_id=None,
  8. debug_mode=False,
  9. ):
  10. return pdf_parse_union(pdf_bytes,
  11. model_list,
  12. imageWriter,
  13. "txt",
  14. start_page_id=start_page_id,
  15. end_page_id=end_page_id,
  16. debug_mode=debug_mode,
  17. )
# Script entry point: intentionally a no-op, so running this module directly
# does nothing.
if __name__ == "__main__":
    pass
  20. # if 1:
  21. # import fitz
  22. # import json
  23. #
  24. # with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
  25. # pdf_bytes = f.read()
  26. # pdf_docs = fitz.open("pdf", pdf_bytes)
  27. #
  28. # with open("/opt/data/pdf/20240418/25536-00.json") as f:
  29. # model_list = json.loads(f.readline())
  30. #
  31. # magic_model = MagicModel(model_list, pdf_docs)
  32. # for i in range(7):
  33. # print(magic_model.get_imgs(i))
  34. #
  35. # for page_no, page in enumerate(pdf_docs):
  36. # inline_equations, interline_equations, interline_equation_blocks = (
  37. # magic_model.get_equations(page_no)
  38. # )
  39. #
  40. # text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
  41. # char_level_text_blocks = page.get_text(
  42. # "rawdict", flags=fitz.TEXTFLAGS_TEXT
  43. # )["blocks"]
  44. # text_blocks = combine_chars_to_pymudict(
  45. # text_raw_blocks, char_level_text_blocks
  46. # )
  47. # text_blocks = replace_equations_in_textblock(
  48. # text_blocks, inline_equations, interline_equations
  49. # )
  50. # text_blocks = remove_citation_marker(text_blocks)
  51. #
  52. # text_blocks = remove_chars_in_text_blocks(text_blocks)