content_extractor.py 3.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import re
  2. from typing import List
  3. from bs4 import BeautifulSoup
  4. # ✅ 兼容相对导入和绝对导入
  5. try:
  6. from .text_processor import TextProcessor
  7. except ImportError:
  8. from text_processor import TextProcessor
  9. class ContentExtractor:
  10. """从Markdown中提取表格和段落"""
  11. def __init__(self):
  12. self.text_processor = TextProcessor()
  13. def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
  14. """从Markdown中提取表格数据"""
  15. tables = []
  16. soup = BeautifulSoup(md_content, 'html.parser')
  17. html_tables = soup.find_all('table')
  18. for table in html_tables:
  19. table_data = []
  20. rows = table.find_all('tr')
  21. for row in rows:
  22. cells = row.find_all(['td', 'th'])
  23. row_data = []
  24. for cell in cells:
  25. cell_text = self.text_processor.normalize_text(cell.get_text())
  26. if not self.text_processor.is_image_reference(cell_text):
  27. row_data.append(cell_text)
  28. else:
  29. row_data.append("[图片内容-忽略]")
  30. if row_data:
  31. table_data.append(row_data)
  32. if table_data:
  33. tables.append(table_data)
  34. return tables
  35. def extract_paragraphs(self, md_content: str) -> List[str]:
  36. """提取段落文本"""
  37. content = re.sub(r'<table[^>]*>.*?</table>', '', md_content, flags=re.DOTALL | re.IGNORECASE)
  38. content = re.sub(r'<[^>]+>', '', content)
  39. content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
  40. paragraphs = []
  41. lines = content.split('\n')
  42. merged_lines = self._merge_split_paragraphs(lines)
  43. for line in merged_lines:
  44. normalized = self.text_processor.normalize_text(line)
  45. if normalized:
  46. paragraphs.append(normalized)
  47. return paragraphs
  48. def _merge_split_paragraphs(self, lines: List[str]) -> List[str]:
  49. """合并连续的非空行作为一个段落"""
  50. merged_lines = []
  51. current_paragraph = ""
  52. for line in lines:
  53. if not line:
  54. if current_paragraph:
  55. merged_lines.append(current_paragraph)
  56. current_paragraph = ""
  57. continue
  58. if self.text_processor.is_image_reference(line):
  59. continue
  60. is_title = (
  61. line.startswith(('一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、')) or
  62. line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')) or
  63. line.startswith('#')
  64. )
  65. if is_title:
  66. if current_paragraph:
  67. merged_lines.append(current_paragraph)
  68. current_paragraph = ""
  69. merged_lines.append(line)
  70. else:
  71. if current_paragraph and not current_paragraph.endswith((' ', '\t')):
  72. current_paragraph += line
  73. else:
  74. current_paragraph = line
  75. if current_paragraph:
  76. merged_lines.append(current_paragraph)
  77. return merged_lines