Browse Source

add trans deps

zhouchangda cách đây 4 tháng
mục cha
commit
e88afb4510

+ 1 - 0
.precommit/check_imports.py

@@ -83,6 +83,7 @@ MOD_TO_DEP = {
     "ujson": "ujson",
     "uvicorn": "uvicorn",
     "yarl": "yarl",
+    "bs4": "beautifulsoup4",
 }
 assert (
     set(MOD_TO_DEP.values()) == DEP_SPECS.keys()

+ 1 - 1
api_examples/pipelines/test_pp_doctranslation.py

@@ -16,7 +16,7 @@ from paddlex import create_pipeline
 
 pipeline = create_pipeline(pipeline="PP-DocTranslation")
 
-input_path = "docs/pipeline_usage/tutorials/ocr_pipelines/PP-Translation.md"
+input_path = "document_sample.pdf"
 output_path = "./output"
 
 chat_bot_config = {

+ 1 - 0
docs/installation/installation.en.md

@@ -278,6 +278,7 @@ PaddleX currently provides the following dependency groups:
 | `speech` | Basic features of speech pipelines. |
 | `ts` | Basic features of time series pipelines. |
 | `video` | Basic features of video pipelines. |
+| `trans` | Basic features of translation pipelines. |
 | `serving` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. |
 | `plugins` | All plugin-provided features that support installation via dependency groups. |
 | `all` | All basic features of PaddleX, as well as all plugin-provided features installable via dependency groups. |

+ 1 - 0
docs/installation/installation.md

@@ -279,6 +279,7 @@ PaddleX 目前提供如下依赖组:
 | `speech` | 语音产线的基础功能。 |
 | `ts` | 时序产线的基础功能。 |
 | `video` | 视频产线的基础功能。 |
+| `trans` | 翻译产线的基础功能。 |
 | `serving` | 服务化部署功能。安装此依赖组等效于安装 PaddleX 服务化部署插件;也可以通过 PaddleX CLI 安装服务化部署插件。 |
 | `plugins` | 所有支持通过指定依赖组安装的插件提供的功能。 |
 | `all` | PaddleX 的所有基础功能,以及所有支持通过指定依赖组安装的插件提供的功能。 |

+ 2 - 3
paddlex/configs/pipelines/PP-DocTranslation.yaml

@@ -16,7 +16,7 @@ SubModules:
       module_name: prompt_engneering
       task_type: translate_prompt
       
-      task_description: '你是一位多语种语言翻译专家,精通多种语言的语法、词汇和文化背景。你的任务是将文本从一种语言准确地转换为另一种语言,同时保留原文的语义、风格和语调。'
+      task_description: '你是一位资深的多语种语言翻译专家,精通多种语言的语法、词汇、文化背景以及语言风格。你的任务是将文本从一种语言准确地转换为另一种语言,同时精准地保留原文的语义、风格和语调,确保翻译内容在目标语言中自然流畅且富有文化适应性。'
 
       output_format: '输出应为翻译后的文本,并与原文保持格式一致,包括标点符号和段落结构。如果原文中包含特定的格式(如表格、公式、列表等),翻译后的文本也应保持相同的格式。'
 
@@ -28,8 +28,7 @@ SubModules:
               5. 避免使用机器翻译工具的简单直译,需根据上下文进行调整和优化。
               6. 原文中可能包含的非文本元素(如HTML语法中的图片、表格、公式等)应保持不变。
               7. 原文中可能包含的代码块,如编程语言代码等,应保持代码块的完整性,不要对代码进行调整。
-              8. 对于原文中的HTML结构代码,并且HTML代码可能被分割,不要省略或修改原文中不完整的HTML代码,不要做任何调整。
-              9. 翻译完成后,应仔细校对,确保没有语法和拼写错误'
+              8. 翻译完成后,应仔细校对,确保没有语法和拼写错误。'
       few_shot_demo_text_content:
       few_shot_demo_key_value_list:
 

+ 84 - 17
paddlex/inference/pipelines/pp_doctranslation/pipeline.py

@@ -26,7 +26,7 @@ from ..base import BasePipeline
 from .result import MarkdownResult
 
 
-@pipeline_requires_extra("ie")
+@pipeline_requires_extra("trans")
 class PP_DocTranslation_Pipeline(BasePipeline):
     entities = ["PP-DocTranslation"]
 
@@ -247,6 +247,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         return markdown_info_list
 
     def split_markdown(self, md_text, chunk_size):
+        from bs4 import BeautifulSoup
 
         if (
             not isinstance(md_text, str)
@@ -258,34 +259,100 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         chunks = []
         current_chunk = []
 
-        # if md_text less than chunk_size, return the md_text
+        # 如果整体文本小于chunk_size,直接返回
         if len(md_text) < chunk_size:
-            chunks.append(md_text)
-            return chunks
+            return [md_text]
 
-        # split the md_text into paragraphs
-        paragraphs = md_text.split("\n\n")
+        # 段落分割,两个及以上换行符视为分段
+        paragraphs = re.split(r"\n{2,}", md_text)
+
+        def split_table_to_chunks(table_html):
+            # 使用 BeautifulSoup 解析表格
+            soup = BeautifulSoup(table_html, "html.parser")
+            table = soup.find("table")
+
+            if not table:
+                return [table_html]  # 如果没有找到表格,直接返回原始内容
+
+            # 提取所有<tr>行
+            trs = table.find_all("tr")
+
+            # 按行累加,确保每个chunk长度<=chunk_size,且不破坏<tr>的完整性
+            table_chunks = []
+            current_rows = []
+            current_len = len("<table></table>")  # 基础长度
+
+            for tr in trs:
+                tr_str = str(tr)
+                row_len = len(tr_str)
+                if current_rows and current_len + row_len > chunk_size:
+                    # 打包当前chunk
+                    content = "<table>" + "".join(current_rows) + "</table>"
+                    table_chunks.append(content)
+                    current_rows = []  # 重置当前行列表
+                    current_len = len("<table></table>") + row_len
+
+                current_rows.append(tr_str)
+                current_len += row_len
+
+            if current_rows:
+                content = "<table>" + "".join(current_rows) + "</table>"
+                table_chunks.append(content)
+
+            return table_chunks
+
+        # 句子分割,英文句号需区分小数点
+        sentence_pattern = re.compile(
+            r"(?<=[。!?!?])|(?<=\.)\s+(?=[A-Z])|(?<=\.)\s*$"
+        )
 
         for paragraph in paragraphs:
-            if len(paragraph) == 0:
-                # 空行直接跳过
+            paragraph = paragraph.strip()
+            if not paragraph:
                 continue
 
-            if len(paragraph) <= chunk_size:
+            # 使用 BeautifulSoup 检查是否为完整表格
+            soup = BeautifulSoup(paragraph, "html.parser")
+            table = soup.find("table")
+
+            if table:
+                table_html = str(table)
+                if len(table_html) <= chunk_size:
+                    if current_chunk:
+                        chunks.append("\n\n".join(current_chunk))
+                        current_chunk = []
+                    chunks.append(table_html)
+                else:
+                    # 表格太大,行分段
+                    if current_chunk:
+                        chunks.append("\n\n".join(current_chunk))
+                        current_chunk = []
+                    table_chunks = split_table_to_chunks(table_html)
+                    chunks.extend(table_chunks)
+                continue
+
+            # 普通文本处理
+            if sum(len(s) for s in current_chunk) + len(paragraph) <= chunk_size:
                 current_chunk.append(paragraph)
+            elif len(paragraph) <= chunk_size:
+                if current_chunk:
+                    chunks.append("\n\n".join(current_chunk))
+                current_chunk = [paragraph]
             else:
-                # if the paragraph is too long, split it into sentences
-                sentences = re.split(r"(?<=[。.!?])", paragraph)
+                # 段落太长,按句子切分
+                sentences = [
+                    s for s in sentence_pattern.split(paragraph) if s and s.strip()
+                ]
                 for sentence in sentences:
-                    if len(sentence) == 0:
+                    sentence = sentence.strip()
+                    if not sentence:
                         continue
-
                     if len(sentence) > chunk_size:
                         raise ValueError("A sentence exceeds the chunk size limit.")
-
-                    # if the current chunk is too long, store it and start a new one
                     if sum(len(s) for s in current_chunk) + len(sentence) > chunk_size:
-                        chunks.append("\n\n".join(current_chunk))
+                        if current_chunk:
+                            chunks.append("\n\n".join(current_chunk))
                         current_chunk = [sentence]
                     else:
                         current_chunk.append(sentence)
@@ -297,7 +364,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         if current_chunk:
             chunks.append("\n\n".join(current_chunk))
 
-        return chunks
+        return [c for c in chunks if c.strip()]
 
     def translate(
         self,

+ 3 - 2
paddlex/inference/pipelines/pp_doctranslation/result.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pathlib as Path
+from pathlib import Path
+
 from ...common.result import BaseCVResult, MarkdownMixin
 
 
@@ -21,7 +22,7 @@ class MarkdownResult(BaseCVResult, MarkdownMixin):
         """Initializes a new instance of the class with the specified data."""
         super().__init__(data)
         MarkdownMixin.__init__(self)
-        
+
     def _get_input_fn(self):
         fn = super()._get_input_fn()
         if (page_idx := self.get("page_index", None)) is not None:

+ 16 - 0
setup.py

@@ -74,6 +74,7 @@ DEP_SPECS = {
     "ujson": "",
     "uvicorn": ">= 0.16",
     "yarl": ">= 1.9",
+    "beautifulsoup4": "",
 }
 
 REQUIRED_DEPS = [
@@ -136,6 +137,21 @@ EXTRAS = {
             "shapely",
             "tokenizers",
         ],
+        "trans": [
+            "ftfy",
+            "imagesize",
+            "lxml",
+            "openai",
+            "opencv-contrib-python",
+            "openpyxl",
+            "premailer",
+            "pyclipper",
+            "pypdfium2",
+            "scikit-learn",
+            "shapely",
+            "tokenizers",
+            "beautifulsoup4",
+        ],
         "ocr": [
             "einops",
             "ftfy",