import re
from typing import List
from bs4 import BeautifulSoup
# ✅ 兼容相对导入和绝对导入
try:
from .text_processor import TextProcessor
except ImportError:
from text_processor import TextProcessor
class ContentExtractor:
    """Extract table data and paragraph text from Markdown with embedded HTML.

    Text cleanup and image-reference detection are delegated to the
    ``TextProcessor`` instance stored on ``self.text_processor``.
    """

    # Whole <table>...</table> elements. Tables are handled separately by
    # extract_table_data, so they are removed before paragraph extraction.
    _TABLE_BLOCK_RE = re.compile(
        r'<table\b.*?</table\s*>', re.DOTALL | re.IGNORECASE
    )
    # HTML comments, possibly spanning several lines.
    _HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
    # Any remaining single HTML tag; only the tag is dropped, its inner text
    # is kept.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')

    # Line prefixes that mark a heading-like line which must stay on its own.
    _TITLE_PREFIXES = (
        '一、', '二、', '三、', '四、', '五、',
        '六、', '七、', '八、', '九、', '十、',
        '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.',
        '#',
    )

    # Stand-in text stored for table cells flagged as image references.
    _IMAGE_PLACEHOLDER = "[图片内容-忽略]"

    def __init__(self):
        self.text_processor = TextProcessor()

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Return the contents of every HTML ``<table>`` found in *md_content*.

        Args:
            md_content: Markdown text that may contain HTML tables.

        Returns:
            One entry per table; each table is a list of rows and each row a
            list of cell strings (``<td>`` and ``<th>`` alike), normalized via
            ``text_processor.normalize_text``. Cells that the text processor
            flags as image references are replaced by a fixed placeholder.
            Rows without cells and tables without rows are omitted.
        """
        tables = []
        soup = BeautifulSoup(md_content, 'html.parser')
        for table in soup.find_all('table'):
            table_data = []
            for row in table.find_all('tr'):
                row_data = []
                for cell in row.find_all(['td', 'th']):
                    cell_text = self.text_processor.normalize_text(
                        cell.get_text()
                    )
                    if self.text_processor.is_image_reference(cell_text):
                        row_data.append(self._IMAGE_PLACEHOLDER)
                    else:
                        row_data.append(cell_text)
                if row_data:
                    table_data.append(row_data)
            if table_data:
                tables.append(table_data)
        return tables

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Return the paragraph texts found in *md_content*.

        HTML tables and HTML comments are removed entirely, remaining HTML
        tags are stripped (their inner text is kept), consecutive lines are
        merged into paragraphs, and each paragraph is normalized via
        ``text_processor.normalize_text``.

        Args:
            md_content: Markdown text, possibly containing inline HTML.

        Returns:
            The normalized paragraphs in document order; paragraphs that
            normalize to an empty value are dropped.
        """
        content = self._TABLE_BLOCK_RE.sub('', md_content)
        # Comments go before generic tag stripping: a comment containing '>'
        # would otherwise be cut at that character and leave residue behind.
        content = self._HTML_COMMENT_RE.sub('', content)
        content = self._HTML_TAG_RE.sub('', content)

        paragraphs = []
        for line in self._merge_split_paragraphs(content.split('\n')):
            normalized = self.text_processor.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)
        return paragraphs

    def _merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge runs of consecutive non-blank lines into single paragraphs.

        Lines are concatenated without any separator. A blank
        (empty or whitespace-only) line ends the current paragraph. Lines the
        text processor flags as image references are skipped. Heading-like
        lines (see ``_TITLE_PREFIXES``) end the current paragraph and are
        emitted on their own.

        Args:
            lines: The document split into individual lines.

        Returns:
            The merged paragraphs and heading lines in their original order.
        """
        merged_lines = []
        current_paragraph = ""

        for line in lines:
            # Whitespace-only lines (e.g. a lone '\r' left over from CRLF
            # input) separate paragraphs just like empty lines do.
            if not line.strip():
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                continue

            if self.text_processor.is_image_reference(line):
                continue

            if line.startswith(self._TITLE_PREFIXES):
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                merged_lines.append(line)
            else:
                # Always append: replacing the accumulated text when it ended
                # with a space or tab would silently drop earlier lines.
                current_paragraph += line

        if current_paragraph:
            merged_lines.append(current_paragraph)
        return merged_lines