Browse Source

add trans deps

zhouchangda cách đây 4 tháng
mục cha
commit
e88afb4510

+ 1 - 0
.precommit/check_imports.py

@@ -83,6 +83,7 @@ MOD_TO_DEP = {
     "ujson": "ujson",
     "uvicorn": "uvicorn",
     "yarl": "yarl",
+    "bs4": "beautifulsoup4",
 }
 assert (
     set(MOD_TO_DEP.values()) == DEP_SPECS.keys()

+ 1 - 1
api_examples/pipelines/test_pp_doctranslation.py

@@ -16,7 +16,7 @@ from paddlex import create_pipeline
 
 pipeline = create_pipeline(pipeline="PP-DocTranslation")
 
-input_path = "docs/pipeline_usage/tutorials/ocr_pipelines/PP-Translation.md"
+input_path = "document_sample.pdf"
 output_path = "./output"
 
 chat_bot_config = {

+ 1 - 0
docs/installation/installation.en.md

@@ -278,6 +278,7 @@ PaddleX currently provides the following dependency groups:
 | `speech` | Basic features of speech pipelines. |
 | `ts` | Basic features of time series pipelines. |
 | `video` | Basic features of video pipelines. |
+| `trans` | Basic features of translation pipelines. |
 | `serving` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. |
 | `plugins` | All plugin-provided features that support installation via dependency groups. |
 | `all` | All basic features of PaddleX, as well as all plugin-provided features installable via dependency groups. |

+ 1 - 0
docs/installation/installation.md

@@ -279,6 +279,7 @@ PaddleX 目前提供如下依赖组:
 | `speech` | 语音产线的基础功能。 |
 | `ts` | 时序产线的基础功能。 |
 | `video` | 视频产线的基础功能。 |
+| `trans` | 翻译产线的基础功能。 |
 | `serving` | 服务化部署功能。安装此依赖组等效于安装 PaddleX 服务化部署插件;也可以通过 PaddleX CLI 安装服务化部署插件。 |
 | `plugins` | 所有支持通过指定依赖组安装的插件提供的功能。 |
 | `all` | PaddleX 的所有基础功能,以及所有支持通过指定依赖组安装的插件提供的功能。 |

+ 2 - 3
paddlex/configs/pipelines/PP-DocTranslation.yaml

@@ -16,7 +16,7 @@ SubModules:
       module_name: prompt_engneering
       task_type: translate_prompt
       
-      task_description: '你是一位多语种语言翻译专家,精通多种语言的语法、词汇和文化背景。你的任务是将文本从一种语言准确地转换为另一种语言,同时保留原文的语义、风格和语调。'
+      task_description: '你是一位资深的多语种语言翻译专家,精通多种语言的语法、词汇、文化背景以及语言风格。你的任务是将文本从一种语言准确地转换为另一种语言,同时精准地保留原文的语义、风格和语调,确保翻译内容在目标语言中自然流畅且富有文化适应性。'
 
       output_format: '输出应为翻译后的文本,并与原文保持格式一致,包括标点符号和段落结构。如果原文中包含特定的格式(如表格、公式、列表等),翻译后的文本也应保持相同的格式。'
 
@@ -28,8 +28,7 @@ SubModules:
               5. 避免使用机器翻译工具的简单直译,需根据上下文进行调整和优化。
               6. 原文中可能包含的非文本元素(如HTML语法中的图片、表格、公式等)应保持不变。
               7. 原文中可能包含的代码块,如编程语言代码等,应保持代码块的完整性,不要对代码进行调整。
-              8. 对于原文中的HTML结构代码,并且HTML代码可能被分割,不要省略或修改原文中不完整的HTML代码,不要做任何调整。
-              9. 翻译完成后,应仔细校对,确保没有语法和拼写错误'
+              8. 翻译完成后,应仔细校对,确保没有语法和拼写错误。'
       few_shot_demo_text_content:
       few_shot_demo_key_value_list:
 

+ 84 - 17
paddlex/inference/pipelines/pp_doctranslation/pipeline.py

@@ -26,7 +26,7 @@ from ..base import BasePipeline
 from .result import MarkdownResult
 
 
-@pipeline_requires_extra("ie")
+@pipeline_requires_extra("trans")
 class PP_DocTranslation_Pipeline(BasePipeline):
     entities = ["PP-DocTranslation"]
 
@@ -247,6 +247,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         return markdown_info_list
 
     def split_markdown(self, md_text, chunk_size):
+        from bs4 import BeautifulSoup
 
         if (
             not isinstance(md_text, str)
@@ -258,34 +259,100 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         chunks = []
         current_chunk = []
 
-        # if md_text less than chunk_size, return the md_text
+        # 如果整体文本小于chunk_size,直接返回
         if len(md_text) < chunk_size:
-            chunks.append(md_text)
-            return chunks
+            return [md_text]
 
-        # split the md_text into paragraphs
-        paragraphs = md_text.split("\n\n")
+        # 段落分割,两个及以上换行符视为分段
+        paragraphs = re.split(r"\n{2,}", md_text)
+
+        def split_table_to_chunks(table_html):
+            # 使用 BeautifulSoup 解析表格
+            soup = BeautifulSoup(table_html, "html.parser")
+            table = soup.find("table")
+
+            if not table:
+                return [table_html]  # 如果没有找到表格,直接返回原始内容
+
+            # 提取所有<tr>行
+            trs = table.find_all("tr")
+
+            # 按行累加,确保每个chunk长度<=chunk_size,且不破坏<tr>的完整性
+            table_chunks = []
+            current_rows = []
+            current_len = len("<table></table>")  # 基础长度
+
+            for tr in trs:
+                tr_str = str(tr)
+                row_len = len(tr_str)
+                if current_rows and current_len + row_len > chunk_size:
+                    # 打包当前chunk
+                    content = "<table>" + "".join(current_rows) + "</table>"
+                    table_chunks.append(content)
+                    current_rows = []  # 重置当前行列表
+                    current_len = len("<table></table>") + row_len
+
+                current_rows.append(tr_str)
+                current_len += row_len
+
+            if current_rows:
+                content = "<table>" + "".join(current_rows) + "</table>"
+                table_chunks.append(content)
+
+            return table_chunks
+
+        # 句子分割,英文句号需区分小数点
+        sentence_pattern = re.compile(
+            r"(?<=[。!?!?])|(?<=\.)\s+(?=[A-Z])|(?<=\.)\s*$"
+        )
 
         for paragraph in paragraphs:
-            if len(paragraph) == 0:
-                # 空行直接跳过
+            paragraph = paragraph.strip()
+            if not paragraph:
                 continue
 
-            if len(paragraph) <= chunk_size:
+            # 使用 BeautifulSoup 检查是否为完整表格
+            soup = BeautifulSoup(paragraph, "html.parser")
+            table = soup.find("table")
+
+            if table:
+                table_html = str(table)
+                if len(table_html) <= chunk_size:
+                    if current_chunk:
+                        chunks.append("\n\n".join(current_chunk))
+                        current_chunk = []
+                    chunks.append(table_html)
+                else:
+                    # 表格太大,行分段
+                    if current_chunk:
+                        chunks.append("\n\n".join(current_chunk))
+                        current_chunk = []
+                    table_chunks = split_table_to_chunks(table_html)
+                    chunks.extend(table_chunks)
+                continue
+
+            # 普通文本处理
+            if sum(len(s) for s in current_chunk) + len(paragraph) <= chunk_size:
                 current_chunk.append(paragraph)
+            elif len(paragraph) <= chunk_size:
+                if current_chunk:
+                    chunks.append("\n\n".join(current_chunk))
+                current_chunk = [paragraph]
             else:
-                # if the paragraph is too long, split it into sentences
-                sentences = re.split(r"(?<=[。.!?])", paragraph)
+                # 段落太长,按句子切分
+                sentences = [
+                    s for s in sentence_pattern.split(paragraph) if s and s.strip()
+                ]
                 for sentence in sentences:
-                    if len(sentence) == 0:
+                    sentence = sentence.strip()
+                    if not sentence:
                         continue
-
                     if len(sentence) > chunk_size:
                         raise ValueError("A sentence exceeds the chunk size limit.")
-
-                    # if the current chunk is too long, store it and start a new one
                     if sum(len(s) for s in current_chunk) + len(sentence) > chunk_size:
-                        chunks.append("\n\n".join(current_chunk))
+                        if current_chunk:
+                            chunks.append("\n\n".join(current_chunk))
                         current_chunk = [sentence]
                     else:
                         current_chunk.append(sentence)
@@ -297,7 +364,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
         if current_chunk:
             chunks.append("\n\n".join(current_chunk))
 
-        return chunks
+        return [c for c in chunks if c.strip()]
 
     def translate(
         self,

+ 3 - 2
paddlex/inference/pipelines/pp_doctranslation/result.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pathlib as Path
+from pathlib import Path
+
 from ...common.result import BaseCVResult, MarkdownMixin
 
 
@@ -21,7 +22,7 @@ class MarkdownResult(BaseCVResult, MarkdownMixin):
         """Initializes a new instance of the class with the specified data."""
         super().__init__(data)
         MarkdownMixin.__init__(self)
-        
+
     def _get_input_fn(self):
         fn = super()._get_input_fn()
         if (page_idx := self.get("page_index", None)) is not None:

+ 16 - 0
setup.py

@@ -74,6 +74,7 @@ DEP_SPECS = {
     "ujson": "",
     "uvicorn": ">= 0.16",
     "yarl": ">= 1.9",
+    "beautifulsoup4": "",
 }
 
 REQUIRED_DEPS = [
@@ -136,6 +137,21 @@ EXTRAS = {
             "shapely",
             "tokenizers",
         ],
+        "trans": [
+            "ftfy",
+            "imagesize",
+            "lxml",
+            "openai",
+            "opencv-contrib-python",
+            "openpyxl",
+            "premailer",
+            "pyclipper",
+            "pypdfium2",
+            "scikit-learn",
+            "shapely",
+            "tokenizers",
+            "beautifulsoup4",
+        ],
         "ocr": [
             "einops",
             "ftfy",