import re
from typing import List

from bs4 import BeautifulSoup

# Support both package-relative imports and plain script-style imports.
try:
    from .text_processor import TextProcessor
except ImportError:
    from text_processor import TextProcessor


class ContentExtractor:
    """Extract HTML tables and plain-text paragraphs from Markdown content."""

    # Placeholder stored for a table cell whose text is an image reference.
    _IMAGE_CELL_PLACEHOLDER = "[图片内容-忽略]"

    # HTML comments; DOTALL so that multi-line comments are matched.
    _HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
    # A complete <table>...</table> element, including its contents.
    _TABLE_BLOCK_RE = re.compile(
        r'<table\b[^>]*>.*?</table\s*>',
        re.DOTALL | re.IGNORECASE,
    )
    # Any single remaining HTML tag.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')

    # Line prefixes that mark a heading: Chinese ordinals, "N." numbering
    # and Markdown "#" headings.
    _TITLE_PREFIXES = (
        '一、', '二、', '三、', '四、', '五、',
        '六、', '七、', '八、', '九、', '十、',
        '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.',
        '#',
    )

    def __init__(self):
        """Create the extractor together with its TextProcessor helper."""
        self.text_processor = TextProcessor()

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Extract the cell text of every HTML table found in the content.

        Args:
            md_content: Markdown text that may contain embedded HTML tables.

        Returns:
            A list of tables. Each table is a list of rows and each row is a
            list of cell strings, normalized with
            ``TextProcessor.normalize_text``. Cells that
            ``TextProcessor.is_image_reference`` reports as image references
            are replaced by a fixed placeholder. Rows without any ``td``/``th``
            cell and tables without any such row are omitted.
        """
        tables = []
        soup = BeautifulSoup(md_content, 'html.parser')

        for table in soup.find_all('table'):
            table_rows = []
            for row in table.find_all('tr'):
                row_cells = []
                for cell in row.find_all(['td', 'th']):
                    cell_text = self.text_processor.normalize_text(cell.get_text())
                    if self.text_processor.is_image_reference(cell_text):
                        row_cells.append(self._IMAGE_CELL_PLACEHOLDER)
                    else:
                        row_cells.append(cell_text)
                if row_cells:
                    table_rows.append(row_cells)
            if table_rows:
                tables.append(table_rows)

        return tables

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Extract paragraph text, excluding tables, comments and HTML tags.

        Args:
            md_content: Markdown text, possibly with embedded HTML.

        Returns:
            The merged paragraphs and headings in document order, each
            normalized with ``TextProcessor.normalize_text``. Entries whose
            normalized form is empty are dropped.
        """
        # Comments go first: a comment may itself contain '>' or table
        # markup, which the later patterns would otherwise cut apart.
        content = self._HTML_COMMENT_RE.sub('', md_content)
        # Drop whole tables (markup and cell text) so table content is not
        # reported again as paragraph text.
        content = self._TABLE_BLOCK_RE.sub('', content)
        # Strip the remaining tags but keep the text between them.
        content = self._HTML_TAG_RE.sub('', content)

        paragraphs = []
        for line in self._merge_split_paragraphs(content.split('\n')):
            normalized = self.text_processor.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)
        return paragraphs

    def _merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge runs of consecutive non-blank lines into single paragraphs.

        Blank or whitespace-only lines end the current paragraph. Lines that
        ``TextProcessor.is_image_reference`` reports as image references are
        skipped without ending the paragraph. Heading lines (see
        ``_TITLE_PREFIXES``) end the current paragraph and are emitted as
        entries of their own.

        Args:
            lines: The individual lines of the document.

        Returns:
            The merged paragraphs and headings in their original order.
        """
        merged_lines: List[str] = []
        pending: List[str] = []  # pieces of the paragraph being assembled

        def flush() -> None:
            # Emit the paragraph collected so far, if there is one.
            if pending:
                merged_lines.append("".join(pending))
                pending.clear()

        for line in lines:
            # Whitespace-only lines (e.g. a lone '\r' from CRLF input) are
            # paragraph breaks just like truly empty lines.
            if not line.strip():
                flush()
                continue

            if self.text_processor.is_image_reference(line):
                continue

            if line.startswith(self._TITLE_PREFIXES):
                flush()
                merged_lines.append(line)
            else:
                # Always extend the running paragraph; lines are joined
                # without a separator, and text collected earlier is never
                # discarded.
                pending.append(line)

        flush()
        return merged_lines