"""
pre_clean.py

Clean data: strip inline image references from downloaded Markdown files
and write the cleaned copies next to them.
"""
import argparse
import os
import re

import htmltabletomd  # type: ignore
import pypandoc
  10. parser = argparse.ArgumentParser(description="get tool type")
  11. parser.add_argument(
  12. "--tool_name",
  13. type=str,
  14. required=True,
  15. help="input tool name",
  16. )
  17. parser.add_argument(
  18. "--download_dir",
  19. type=str,
  20. required=True,
  21. help="input download dir",
  22. )
  23. args = parser.parse_args()
  24. def clean_markdown_images(content):
  25. """
  26. clean markdown images
  27. """
  28. pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
  29. cleaned_content = pattern.sub('', content)
  30. return cleaned_content
  31. def clean_ocrmath_photo(content):
  32. """
  33. clean ocrmath photo
  34. """
  35. pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
  36. cleaned_content = pattern.sub('', content)
  37. return cleaned_content
  38. def convert_html_table_to_md(html_table):
  39. """
  40. convert html table to markdown table
  41. """
  42. lines = html_table.strip().split('\n')
  43. md_table = ''
  44. if lines and '<tr>' in lines[0]:
  45. in_thead = True
  46. for line in lines:
  47. if '<th>' in line:
  48. cells = re.findall(r'<th>(.*?)</th>', line)
  49. md_table += '| ' + ' | '.join(cells) + ' |\n'
  50. in_thead = False
  51. elif '<td>' in line and not in_thead:
  52. cells = re.findall(r'<td>(.*?)</td>', line)
  53. md_table += '| ' + ' | '.join(cells) + ' |\n'
  54. md_table = md_table.rstrip() + '\n'
  55. return md_table
  56. def convert_latext_to_md(content):
  57. """
  58. convert latex table to markdown table
  59. """
  60. tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
  61. placeholders = []
  62. for table in tables:
  63. placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
  64. replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
  65. content = content.replace(replace_str, placeholder)
  66. try:
  67. pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
  68. except:
  69. markdown_string = replace_str
  70. else:
  71. markdown_string = open('output.md', 'r', encoding='utf-8').read()
  72. placeholders.append((placeholder, markdown_string))
  73. new_content = content
  74. for placeholder, md_table in placeholders:
  75. new_content = new_content.replace(placeholder, md_table)
  76. # 写入文件
  77. return new_content
  78. def convert_htmltale_to_md(content):
  79. """
  80. convert html table to markdown table
  81. """
  82. tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
  83. placeholders = []
  84. for table in tables:
  85. placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
  86. content = content.replace(f"<table>{table}</table>", placeholder)
  87. try:
  88. convert_table = htmltabletomd.convert_table(table)
  89. except:
  90. convert_table = table
  91. placeholders.append((placeholder,convert_table))
  92. new_content = content
  93. for placeholder, md_table in placeholders:
  94. new_content = new_content.replace(placeholder, md_table)
  95. # 写入文件
  96. return new_content
  97. def clean_data(prod_type, download_dir):
  98. """
  99. clean data
  100. """
  101. tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
  102. if not os.path.exists(tgt_dir):
  103. os.makedirs(tgt_dir)
  104. source_dir = os.path.join(download_dir, prod_type)
  105. filenames = os.listdir(source_dir)
  106. for filename in filenames:
  107. if filename.endswith('.md'):
  108. input_file = os.path.join(source_dir, filename)
  109. output_file = os.path.join(tgt_dir, "cleaned_" + filename)
  110. with open(input_file, 'r', encoding='utf-8') as fr:
  111. content = fr.read()
  112. new_content = clean_markdown_images(content)
  113. with open(output_file, 'w', encoding='utf-8') as fw:
  114. fw.write(new_content)
  115. if __name__ == '__main__':
  116. tool_type = args.tool_name
  117. download_dir = args.download_dir
  118. clean_data(tool_type, download_dir)