4 months ago · b102f238dd
--- a/api_examples/pipelines/test_pp_doctranslation.py
+++ b/api_examples/pipelines/test_pp_doctranslation.py
@@ -56,7 +56,7 @@ else:
 
				 tgt_md_info_list = pipeline.translate(
			
 
				     ori_md_info_list=ori_md_info_list,
			
 
				     target_language="en",
			
 
				-    chunk_size=5000,
			
 
				+    chunk_size=3000,
			
 
				     chat_bot_config=chat_bot_config,
			
 
				 )
			
 
				 for tgt_md_info in tgt_md_info_list:
			
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-DocTranslation.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-DocTranslation.md
@@ -7,6 +7,8 @@ comments: true
 
				 ## 1. PP-DocTranslation产线介绍
			
 
				 通用文档翻译产线（PP-DocTranslation）是飞桨提供的文档智能翻译解决方案，融合了先进的通用版面解析技术与大语言模型（LLM）能力，为您提供高效的文档智能翻译服务。该解决方案能够精准识别并提取文档中的各类元素，包括文本块、标题、段落、图片、表格及其他复杂版面结构，并在此基础之上实现高质量的多语种互译。PP-DocTranslation 支持多种主流语言间的相互翻译，尤其擅长处理排版复杂、上下文依赖性强的文档场景，力求输出精准自然、流畅专业的翻译结果。本产线同时提供了灵活的服务化部署方式，支持在多种硬件上使用多种编程语言调用。不仅如此，本产线也提供了二次开发的能力，您可以基于本产线在您自己的数据集上训练调优，训练后的模型也可以无缝集成。
			
 
				 
			
 
				+<img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/main/images/pipelines/doc_translation/pp_doctranslation.png">
			
 
				+
			
 
				 
			
 
				 <b>通用文档翻译产线中使用了通用版面解析v3子产线，因此具有通用版面解析v3产线的所有功能，更多关于通用版面解析v3产线的功能介绍和使用细节，可以点击 [通用版面解析v3产线文档](./PP-StructureV3.md) 页面查看</b>。
			
 
				 
			
@@ -1441,6 +1443,13 @@ for tgt_md_info in tgt_md_info_list:
 
				 <td><code>None</code></td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				+<td><code>llm_request_interval</code></td>
			
 
				+<td>向大语言模型发送请求的时间间隔，单位为秒。该参数可用于防止过于频繁地调用大语言模型。</td>
			
 
				+<td><code>float</code></td>
			
 
				+<td>大于等于0的浮点数</td>
			
 
				+<td><code>0</code></td>
			
 
				+</tr>
			
 
				+<tr>
			
 
				 <td><code>chat_bot_config</code></td>
			
 
				 <td>大语言模型配置</td>
			
 
				 <td><code>dict|None</code></td>
			
@@ -1452,13 +1461,6 @@ for tgt_md_info in tgt_md_info_list:
 
				 </td>
			
 
				 <td><code>None</code></td>
			
 
				 </tr>
			
 
				-<tr>
			
 
				-<td><code>llm_request_interval</code></td>
			
 
				-<td>向大语言模型发送请求的时间间隔，单位为秒。该参数可用于防止过于频繁地调用大语言模型。</td>
			
 
				-<td><code>float</code></td>
			
 
				-<td>大于等于0的浮点数</td>
			
 
				-<td><code>0</code></td>
			
 
				-</tr>
			
 
				 </tbody>
			
 
				 </table>
			
 
				 
			
--- a/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py
+++ b/paddlex/inference/pipelines/components/prompt_engineering/generate_translate_prompt.py
@@ -154,9 +154,17 @@ class GenerateTranslatePrompt(BaseGeneratePrompt):
 
				         if few_shot_demo_text_content is None:
			
 
				             few_shot_demo_text_content = self.few_shot_demo_text_content
			
 
				 
			
 
				+        if few_shot_demo_text_content:
			
 
				+            few_shot_demo_text_content = (
			
 
				+                f"这里是一些示例：\n{few_shot_demo_text_content}\n"
			
 
				+            )
			
 
				+
			
 
				         if few_shot_demo_key_value_list is None:
			
 
				             few_shot_demo_key_value_list = self.few_shot_demo_key_value_list
			
 
				 
			
 
				+        if few_shot_demo_key_value_list:
			
 
				+            few_shot_demo_key_value_list = f"这里是一些专业术语对照表,对照表中单词要参考对照表翻译：\n{few_shot_demo_key_value_list}\n"
			
 
				+
			
 
				         prompt = f"""{task_description}{rules_str}{output_format}{few_shot_demo_text_content}{few_shot_demo_key_value_list}"""
			
 
				 
			
 
				         language_name = language_map.get(language, language)
			
--- a/paddlex/inference/pipelines/pp_doctranslation/pipeline.py
+++ b/paddlex/inference/pipelines/pp_doctranslation/pipeline.py
@@ -13,6 +13,7 @@
 
				 # limitations under the License.
			
 
				 
			
 
				 import re
			
 
				+from time import sleep
			
 
				 from typing import Any, Dict, List, Optional, Tuple, Union
			
 
				 
			
 
				 import numpy as np
			
@@ -310,7 +311,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				                 translate_code_block(
			
 
				                     block_content, chunk_size, translate_func, translation_results
			
 
				                 )
			
 
				-            elif len(block_content) < chunk_size:
			
 
				+            elif len(block_content) < chunk_size and block_type == "text":
			
 
				                 if len(chunk) + len(block_content) < chunk_size:
			
 
				                     chunk += "\n\n" + block_content
			
 
				                 else:
			
@@ -343,14 +344,14 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				         self,
			
 
				         ori_md_info_list: List[Dict],
			
 
				         target_language: str = "zh",
			
 
				-        chunk_size: int = 5000,
			
 
				+        chunk_size: int = 3000,
			
 
				         task_description: str = None,
			
 
				         output_format: str = None,
			
 
				         rules_str: str = None,
			
 
				         few_shot_demo_text_content: str = None,
			
 
				         few_shot_demo_key_value_list: str = None,
			
 
				-        chat_bot_config=None,
			
 
				-        llm_request_interval: float = 0,
			
 
				+        llm_request_interval: float = 0.0,
			
 
				+        chat_bot_config: Dict = None,
			
 
				         **kwargs,
			
 
				     ):
			
 
				         """
			
@@ -365,7 +366,8 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				             rules_str (str, optional): Rules or guidelines for the translation model to follow. Defaults to None.
			
 
				             few_shot_demo_text_content (str, optional): Demo text content for the translation model. Defaults to None.
			
 
				             few_shot_demo_key_value_list (str, optional): Demo text key-value list for the translation model. Defaults to None.
			
 
				-            chat_bot_config (Any, optional): Configuration for the chat bot used in the translation process. Defaults to None.
			
 
				+            llm_request_interval (float, optional): The interval in seconds between each request to the LLM. Defaults to 0.0.
			
 
				+            chat_bot_config (Dict, optional): Configuration for the chat bot used in the translation process. Defaults to None.
			
 
				             **kwargs: Additional keyword arguments passed to the translation model.
			
 
				 
			
 
				         Yields:
			
@@ -391,6 +393,9 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				             # for multi page pdf
			
 
				             ori_md_info_list = [self.concatenate_markdown_pages(ori_md_info_list)]
			
 
				 
			
 
				+        if not isinstance(llm_request_interval, float):
			
 
				+            llm_request_interval = float(llm_request_interval)
			
 
				+
			
 
				         def translate_func(text):
			
 
				             """
			
 
				             Translate the given text using the configured translation model.
			
@@ -401,6 +406,7 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				             Returns:
			
 
				                 str: The translated text in the target language.
			
 
				             """
			
 
				+            sleep(llm_request_interval)
			
 
				             prompt = self.translate_pe.generate_prompt(
			
 
				                 original_text=text,
			
 
				                 language=target_language,
			
@@ -415,6 +421,24 @@ class PP_DocTranslation_Pipeline(BasePipeline):
 
				                 raise Exception("The call to the large model failed.")
			
 
				             return translate
			
 
				 
			
 
				+        base_prompt_content = self.translate_pe.generate_prompt(
			
 
				+            original_text="",
			
 
				+            language=target_language,
			
 
				+            task_description=task_description,
			
 
				+            output_format=output_format,
			
 
				+            rules_str=rules_str,
			
 
				+            few_shot_demo_text_content=few_shot_demo_text_content,
			
 
				+            few_shot_demo_key_value_list=few_shot_demo_key_value_list,
			
 
				+        )
			
 
				+        base_prompt_length = len(base_prompt_content)
			
 
				+
			
 
				+        if chunk_size > base_prompt_length:
			
 
				+            chunk_size = chunk_size - base_prompt_length
			
 
				+        else:
			
 
				+            raise ValueError(
			
 
				+                f"Chunk size should be greater than the base prompt length ({base_prompt_length}), but got {chunk_size}."
			
 
				+            )
			
 
				+
			
 
				         for ori_md in ori_md_info_list:
			
 
				 
			
 
				             original_texts = ori_md["markdown_texts"]
			
--- a/paddlex/inference/pipelines/pp_doctranslation/utils.py
+++ b/paddlex/inference/pipelines/pp_doctranslation/utils.py
@@ -181,118 +181,47 @@ def translate_html_block(html_block, chunk_size, translate_func, results):
 
				 def split_original_texts(text):
			
 
				     """
			
 
				     Split the original text into chunks.
			
 
				-
			
 
				-    Args:
			
 
				-        text (str): The original text to be split.
			
 
				-
			
 
				-    Returns:
			
 
				-        list: A list of strings representing the chunks of the original text.
			
 
				     """
			
 
				     from bs4 import BeautifulSoup
			
 
				 
			
 
				+    # find all html blocks and replace them with placeholders
			
 
				     soup = BeautifulSoup(text, "html.parser")
			
 
				-    result = []
			
 
				-    last_position = 0
			
 
				-    contents = soup.contents
			
 
				-    i = 0
			
 
				-    while i < len(contents):
			
 
				-        element = contents[i]
			
 
				-        str_element = str(element)
			
 
				-        if len(str_element) == 0:
			
 
				-            i += 1
			
 
				-            continue
			
 
				-
			
 
				-        # find element in original text
			
 
				-        start = text.find(str_element, last_position)
			
 
				-        if start != -1:
			
 
				-            end = start + len(str_element)
			
 
				-            element_str = text[start:end]
			
 
				+    html_blocks = []
			
 
				+    html_placeholders = []
			
 
				+    placeholder_fmt = "<<HTML_BLOCK_{}>>"
			
 
				+    text_after_placeholder = ""
			
 
				+
			
 
				+    index = 0
			
 
				+    for elem in soup.contents:
			
 
				+        if hasattr(elem, "name") and elem.name is not None:
			
 
				+            html_str = str(elem)
			
 
				+            placeholder = placeholder_fmt.format(index)
			
 
				+            html_blocks.append(html_str)
			
 
				+            html_placeholders.append(placeholder)
			
 
				+            text_after_placeholder += placeholder
			
 
				+            index += 1
			
 
				         else:
			
 
				-            # if element is not a tag, try to find it in original text
			
 
				-            if hasattr(element, "name") and element.name is not None:
			
 
				-                tag = element.name
			
 
				-                pat = r"<{tag}.*?>.*?</{tag}>".format(tag=tag)
			
 
				-                re_html = re.compile(pat, re.DOTALL)
			
 
				-                match = re_html.search(text, last_position)
			
 
				-                if match:
			
 
				-                    start = match.start()
			
 
				-                    end = match.end()
			
 
				-                    element_str = text[start:end]
			
 
				-                else:
			
 
				-                    element_str = str_element
			
 
				-                    start = -1
			
 
				-                    end = -1
			
 
				-            else:
			
 
				-                element_str = str_element
			
 
				-                start = -1
			
 
				-                end = -1
			
 
				-
			
 
				-        true_start = True
			
 
				-        if start > 0 and text[start - 1] != "\n":
			
 
				-            true_start = False
			
 
				-
			
 
				-        # process previous text
			
 
				-        if start != -1 and last_position < start:
			
 
				-            text_content = text[last_position:start]
			
 
				-            result = split_and_append_text(result, text_content)
			
 
				-
			
 
				-        if hasattr(element, "name") and element.name is not None:
			
 
				-            if (
			
 
				-                end < len(text)
			
 
				-                and end >= 0
			
 
				-                and (text[end] not in ["\n", " "] or element_str.endswith("\n"))
			
 
				-            ):
			
 
				-                next_block_pos = text.find("\n\n", end)
			
 
				-                if next_block_pos == -1:
			
 
				-                    mix_region_end = len(text)
			
 
				-                else:
			
 
				-                    mix_region_end = next_block_pos + 2
			
 
				-
			
 
				-                j = i + 1
			
 
				-                while j < len(contents):
			
 
				-                    next_element_str = str(contents[j])
			
 
				-                    next_start = text.find(next_element_str, end)
			
 
				-                    if next_start == -1 or next_start >= mix_region_end:
			
 
				-                        break
			
 
				-                    j += 1
			
 
				-                if true_start:
			
 
				-                    # merge text and html
			
 
				-                    result.append(
			
 
				-                        ("text_with_html", text[start:mix_region_end].rstrip("\n"))
			
 
				-                    )
			
 
				-                else:
			
 
				-                    _, last_content = result[-1]
			
 
				-                    result.pop()
			
 
				-                    result.append(
			
 
				-                        (
			
 
				-                            "text_with_html",
			
 
				-                            last_content + text[start:mix_region_end].rstrip("\n"),
			
 
				-                        )
			
 
				-                    )
			
 
				-                last_position = mix_region_end
			
 
				-                i = j
			
 
				-            else:
			
 
				-                # pure HTML block
			
 
				-                if true_start:
			
 
				-                    result.append(("html", element_str))
			
 
				-                else:
			
 
				-                    _, last_content = result[-1]
			
 
				-                    result.pop()
			
 
				-                    result.append(("html", last_content + element_str))
			
 
				-                last_position = end
			
 
				-                i += 1
			
 
				-        else:
			
 
				-            # normal text
			
 
				-            result = split_and_append_text(result, element_str)
			
 
				-            last_position = end if end != -1 else last_position + len(element_str)
			
 
				-            i += 1
			
 
				-
			
 
				-    # process remaining text
			
 
				-    if last_position < len(text):
			
 
				-        text_content = text[last_position:]
			
 
				-        result = split_and_append_text(result, text_content)
			
 
				-
			
 
				-    return result
			
 
				+            text_after_placeholder += str(elem)
			
 
				+
			
 
				+    # split text into paragraphs
			
 
				+    splited_block = []
			
 
				+    splited_block = split_and_append_text(splited_block, text_after_placeholder)
			
 
				+
			
 
				+    # replace placeholders with html blocks
			
 
				+    current_index = 0
			
 
				+    for idx, block in enumerate(splited_block):
			
 
				+        _, content = block
			
 
				+        while (
			
 
				+            current_index < len(html_placeholders)
			
 
				+            and html_placeholders[current_index] in content
			
 
				+        ):
			
 
				+            content = content.replace(
			
 
				+                html_placeholders[current_index], html_blocks[current_index]
			
 
				+            )
			
 
				+            current_index += 1
			
 
				+            splited_block[idx] = ("html", content)
			
 
				+
			
 
				+    return splited_block
			
 
				 
			
 
				 
			
 
				 def split_and_append_text(result, text_content):