| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- import re
- from typing import List
- from bs4 import BeautifulSoup
- # ✅ 兼容相对导入和绝对导入
- try:
- from .text_processor import TextProcessor
- except ImportError:
- from text_processor import TextProcessor
class ContentExtractor:
    """Extract table data and paragraph text from Markdown with embedded HTML.

    Text clean-up and image-reference detection are delegated to a
    ``TextProcessor`` instance; this class only decides which pieces of the
    document are tables, headings or paragraph text.
    """

    # Line prefixes that mark a heading / numbered item. Such lines are kept
    # on their own and never merged into a neighbouring paragraph.
    _TITLE_PREFIXES = (
        '一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、',
        '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.',
        '#',
    )

    # Placeholder stored for a table cell whose text is an image reference.
    _IMAGE_CELL_PLACEHOLDER = "[图片内容-忽略]"

    # Compiled once; used by extract_paragraphs to strip markup.
    _COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
    _TABLE_RE = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)
    _TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self):
        self.text_processor = TextProcessor()

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Return the cell text of every HTML ``<table>`` in *md_content*.

        The result is a list of tables; each table is a list of rows; each
        row is a list of cell strings taken from its ``<td>``/``<th>``
        elements. Rows without cells and tables without rows are omitted.
        """
        soup = BeautifulSoup(md_content, 'html.parser')
        tables = []

        for table in soup.find_all('table'):
            table_data = []
            for row in table.find_all('tr'):
                row_data = [
                    self._cell_text(cell) for cell in row.find_all(['td', 'th'])
                ]
                if row_data:
                    table_data.append(row_data)

            if table_data:
                tables.append(table_data)

        return tables

    def _cell_text(self, cell) -> str:
        """Return the normalized text of one cell, or the image placeholder."""
        cell_text = self.text_processor.normalize_text(cell.get_text())
        if self.text_processor.is_image_reference(cell_text):
            return self._IMAGE_CELL_PLACEHOLDER
        return cell_text

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Return the paragraph texts of *md_content* with markup removed.

        HTML comments, whole ``<table>`` elements and any remaining tags are
        stripped, consecutive lines are merged into paragraphs, and every
        paragraph is passed through ``TextProcessor.normalize_text``.
        Paragraphs that normalize to a falsy value are dropped.
        """
        # Comments must go first: the generic tag pattern would otherwise eat
        # only the part of a comment up to its first '>' and leave the rest
        # (e.g. "x -->") behind as paragraph text.
        content = self._COMMENT_RE.sub('', md_content)
        content = self._TABLE_RE.sub('', content)
        content = self._TAG_RE.sub('', content)

        merged_lines = self._merge_split_paragraphs(content.split('\n'))
        normalized = (
            self.text_processor.normalize_text(line) for line in merged_lines
        )
        return [text for text in normalized if text]

    def _merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge consecutive non-blank lines into paragraphs.

        Rules, applied line by line:

        * an empty or whitespace-only line ends the current paragraph;
        * a line recognised as an image reference is skipped;
        * a title line (see ``_TITLE_PREFIXES``) ends the current paragraph
          and is emitted on its own;
        * any other line is concatenated (without a separator) onto the
          current paragraph, unless that paragraph ends with a space or tab,
          in which case the paragraph is emitted and a new one is started.
        """
        merged_lines = []
        current_paragraph = ""

        for line in lines:
            if not line.strip():
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                continue

            if self.text_processor.is_image_reference(line):
                continue

            if line.startswith(self._TITLE_PREFIXES):
                if current_paragraph:
                    merged_lines.append(current_paragraph)
                    current_paragraph = ""
                merged_lines.append(line)
                continue

            if current_paragraph.endswith((' ', '\t')):
                # Trailing whitespace starts a new paragraph. Emit the pending
                # one first so its text is not silently discarded.
                merged_lines.append(current_paragraph)
                current_paragraph = line
            else:
                current_paragraph += line

        if current_paragraph:
            merged_lines.append(current_paragraph)

        return merged_lines
|