# clean_photo.py 4.3 KB

  1. import pypandoc
  2. import re
  3. import htmltabletomd
  4. import os
  5. import argparse
  6. import zipfile
  7. parser = argparse.ArgumentParser(description="get tool type")
  8. parser.add_argument(
  9. "--tool_name",
  10. type=str,
  11. required=True,
  12. help="input tool name",
  13. )
  14. parser.add_argument(
  15. "--download_dir",
  16. type=str,
  17. required=True,
  18. help="input download dir",
  19. )
  20. args = parser.parse_args()
  21. def clean_markdown_images(content):
  22. pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
  23. cleaned_content = pattern.sub('', content)
  24. return cleaned_content
  25. def clean_ocrmath_photo(content):
  26. pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
  27. cleaned_content = pattern.sub('', content)
  28. return cleaned_content
  29. def convert_html_table_to_md(html_table):
  30. lines = html_table.strip().split('\n')
  31. md_table = ''
  32. if lines and '<tr>' in lines[0]:
  33. in_thead = True
  34. for line in lines:
  35. if '<th>' in line:
  36. cells = re.findall(r'<th>(.*?)</th>', line)
  37. md_table += '| ' + ' | '.join(cells) + ' |\n'
  38. in_thead = False
  39. elif '<td>' in line and not in_thead:
  40. cells = re.findall(r'<td>(.*?)</td>', line)
  41. md_table += '| ' + ' | '.join(cells) + ' |\n'
  42. md_table = md_table.rstrip() + '\n'
  43. return md_table
  44. def convert_latext_to_md(content):
  45. tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
  46. placeholders = []
  47. for table in tables:
  48. placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
  49. replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
  50. content = content.replace(replace_str, placeholder)
  51. try:
  52. pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
  53. except:
  54. markdown_string = replace_str
  55. else:
  56. markdown_string = open('output.md', 'r', encoding='utf-8').read()
  57. placeholders.append((placeholder, markdown_string))
  58. new_content = content
  59. for placeholder, md_table in placeholders:
  60. new_content = new_content.replace(placeholder, md_table)
  61. # 写入文件
  62. return new_content
  63. def convert_htmltale_to_md(content):
  64. tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
  65. placeholders = []
  66. for table in tables:
  67. placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
  68. content = content.replace(f"<table>{table}</table>", placeholder)
  69. try:
  70. convert_table = htmltabletomd.convert_table(table)
  71. except:
  72. convert_table = table
  73. placeholders.append((placeholder,convert_table))
  74. new_content = content
  75. for placeholder, md_table in placeholders:
  76. new_content = new_content.replace(placeholder, md_table)
  77. # 写入文件
  78. return new_content
  79. def clean_data(prod_type, download_dir):
  80. file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
  81. for filetype in file_type:
  82. tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
  83. if not os.path.exists(tgt_dir):
  84. os.makedirs(tgt_dir)
  85. source_dir = os.path.join(download_dir, filetype, prod_type)
  86. filenames = os.listdir(source_dir)
  87. for filename in filenames:
  88. if filename.endswith('.md'):
  89. input_file = os.path.join(source_dir, filename)
  90. output_file = os.path.join(tgt_dir, "cleaned_" + filename)
  91. with open(input_file, 'r', encoding='utf-8') as fr:
  92. content = fr.read()
  93. new_content = convert_htmltale_to_md(content)
  94. new_content = clean_markdown_images(new_content)
  95. new_content = clean_ocrmath_photo(new_content)
  96. new_content = convert_latext_to_md(new_content)
  97. with open(output_file, 'w', encoding='utf-8') as fw:
  98. fw.write(new_content)
  99. if __name__ == '__main__':
  100. tool_type = args.tool_name
  101. download_dir = args.download_dir
  102. clean_data(tool_type, download_dir)