# zhengchun/ocr_verify
							import re
from typing import List
from bs4 import BeautifulSoup
# Support both relative (package) and absolute (script) imports.
try:
    from .text_processor import TextProcessor
except ImportError:
    from text_processor import TextProcessor


class ContentExtractor:
    """Extract HTML tables and plain-text paragraphs from Markdown content.

    Text normalisation and image-reference detection are delegated to the
    ``TextProcessor`` instance stored in ``self.text_processor``.
    """

    # Placeholder stored in place of a table cell that is an image reference.
    _IMAGE_CELL_PLACEHOLDER = "[图片内容-忽略]"

    # A line starting with any of these prefixes is treated as a title and is
    # always emitted as a paragraph of its own.
    _TITLE_PREFIXES = (
        '一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、', '十、',
        '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.',
        '#',
    )

    # Compiled once at class creation instead of on every call.
    _TABLE_RE = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)
    _COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
    _TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self):
        self.text_processor = TextProcessor()

    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
        """Return the contents of every ``<table>`` element in *md_content*.

        The result is a list of tables; each table is a list of rows; each
        row is a list of cell strings taken from its ``<td>``/``<th>``
        elements. Cell text is passed through
        ``text_processor.normalize_text``; a cell that
        ``text_processor.is_image_reference`` flags is replaced by a fixed
        placeholder. Rows without cells and tables without rows are omitted.
        """
        tables = []
        soup = BeautifulSoup(md_content, 'html.parser')

        for table in soup.find_all('table'):
            table_data = []

            for row in table.find_all('tr'):
                row_data = []
                for cell in row.find_all(['td', 'th']):
                    cell_text = self.text_processor.normalize_text(cell.get_text())
                    if self.text_processor.is_image_reference(cell_text):
                        row_data.append(self._IMAGE_CELL_PLACEHOLDER)
                    else:
                        row_data.append(cell_text)

                if row_data:
                    table_data.append(row_data)

            if table_data:
                tables.append(table_data)

        return tables

    def extract_paragraphs(self, md_content: str) -> List[str]:
        """Return the non-table text of *md_content* as a list of paragraphs.

        HTML comments, ``<table>...</table>`` spans and any remaining tags
        are removed. The remaining lines are merged into paragraphs by
        ``_merge_split_paragraphs``, each paragraph is passed through
        ``text_processor.normalize_text``, and paragraphs that normalise to
        an empty value are dropped.
        """
        # Comments must go first: the generic tag pattern would otherwise cut
        # a comment at its first '>' and leave the rest (and a stray '-->')
        # behind as visible text.
        content = self._COMMENT_RE.sub('', md_content)
        content = self._TABLE_RE.sub('', content)
        content = self._TAG_RE.sub('', content)

        merged_lines = self._merge_split_paragraphs(content.split('\n'))

        paragraphs = []
        for line in merged_lines:
            normalized = self.text_processor.normalize_text(line)
            if normalized:
                paragraphs.append(normalized)

        return paragraphs

    def _merge_split_paragraphs(self, lines: List[str]) -> List[str]:
        """Merge runs of consecutive non-blank lines into single paragraphs.

        Rules, applied to each line in order:

        * A blank line (empty or whitespace-only) ends the current paragraph.
        * A line flagged by ``text_processor.is_image_reference`` is skipped
          without ending the current paragraph.
        * A title line (see ``_TITLE_PREFIXES``) ends the current paragraph
          and is emitted on its own.
        * Any other line is appended to the current paragraph with no
          separator inserted.
        """
        merged_lines = []
        current_parts = []

        def flush():
            # Emit the paragraph collected so far, if any, and start afresh.
            if current_parts:
                merged_lines.append("".join(current_parts))
                current_parts.clear()

        for line in lines:
            # Whitespace-only lines (including a bare '\r' from CRLF input)
            # count as blank, so they separate paragraphs rather than being
            # glued into the text.
            if not line.strip():
                flush()
                continue

            if self.text_processor.is_image_reference(line):
                continue

            if line.startswith(self._TITLE_PREFIXES):
                flush()
                merged_lines.append(line)
            else:
                # Always extend the paragraph: starting over here would
                # silently discard the text collected so far.
                current_parts.append(line)

        flush()
        return merged_lines