|
|
@@ -24,6 +24,10 @@ class ContentExtractor:
|
|
|
text = re.sub(r'\s*([,。:;!?、])\s*', r'\1', text)
|
|
|
return text
|
|
|
|
|
|
+ def _strip_html_comments(self, content: str) -> str:
|
|
|
+ """移除 HTML/Markdown 注释块(含多行),不参与段落提取与对比。"""
|
|
|
+ return re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
|
|
|
+
|
|
|
def _is_image_reference(self, text: str) -> bool:
|
|
|
"""判断是否为图片引用或描述"""
|
|
|
image_keywords = [
|
|
|
@@ -194,23 +198,19 @@ class ContentExtractor:
|
|
|
return merged_lines
|
|
|
|
|
|
def extract_paragraphs(self, content: str) -> List[str]:
    """Extract the normalized paragraphs contained in ``content``.

    HTML comments and HTML tags are removed before the text is split into
    lines, so neither takes part in the comparison.
    NOTE(review): the original docstring also says "standardization note"
    metadata is excluded; that filtering is not visible in this method and
    presumably lives in a helper -- confirm.

    Args:
        content: Raw document text.

    Returns:
        The non-empty results of ``_normalize_text`` for each merged line,
        in document order.
    """
    # Strip comments before tags: a comment whose body contains '>' would
    # only be partially consumed by the generic tag pattern below.
    without_comments = self._strip_html_comments(content)
    without_tags = re.sub(r'<[^>]+>', '', without_comments)

    # Split on newlines and let the helper merge the resulting lines
    # before each one is normalized.
    merged = self.merge_split_paragraphs(without_tags.split('\n'))

    # Lines whose normalized form is empty are dropped silently.
    normalized_lines = (self._normalize_text(raw_line) for raw_line in merged)
    return [text for text in normalized_lines if text]
|