# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re def _find_split_pos(text, chunk_size): """ Find the position to split the text into two chunks. Args: text (str): The original text to be split. chunk_size (int): The maximum size of each chunk. Returns: int: The index where the text should be split. """ center = len(text) // 2 # Search forward for i in range(center, len(text)): if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]: if i + 1 < len(text) and len(text[: i + 1]) <= chunk_size: return i + 1 # Search backward for i in range(center, 0, -1): if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]: if len(text[: i + 1]) <= chunk_size: return i + 1 # If no suitable position is found, split directly return min(chunk_size, len(text)) def split_text_recursive(text, chunk_size, translate_func, results): """ Split the text recursively and translate each chunk. Args: text (str): The original text to be split. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ text = text.strip() if len(text) <= chunk_size: results.append(translate_func(text)) else: split_pos = _find_split_pos(text, chunk_size) left = text[:split_pos].strip() right = text[split_pos:].strip() if left: split_text_recursive(left, chunk_size, translate_func, results) if right: split_text_recursive(right, chunk_size, translate_func, results) def translate_code_block(code_block, chunk_size, translate_func, results): """ Translate a code block and append the result to the results list. Args: code_block (str): The code block to be translated. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ lines = code_block.strip().split("\n") if lines[0].startswith("```") or lines[0].startswith("~~~"): header = lines[0] footer = ( lines[-1] if (lines[-1].startswith("```") or lines[-1].startswith("~~~")) else "" ) code_content = "\n".join(lines[1:-1]) if footer else "\n".join(lines[1:]) else: header = "" footer = "" code_content = code_block translated_code_lines = [] split_text_recursive( code_content, chunk_size, translate_func, translated_code_lines ) # drop ``` or ~~~ filtered_code_lines = [ line for line in translated_code_lines if not (line.strip().startswith("```") or line.strip().startswith("~~~")) ] translated_code = "\n".join(filtered_code_lines) result = f"{header}\n{translated_code}\n{footer}" if header else translated_code results.append(result) def translate_html_block(html_block, chunk_size, translate_func, results): """ Translate a HTML block and append the result to the results list. Args: html_block (str): The HTML block to be translated. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_block, "html.parser") # collect text nodes text_nodes = [] for node in soup.find_all(string=True, recursive=True): text = node.strip() if text: text_nodes.append(node) idx = 0 total = len(text_nodes) while idx < total: batch_nodes = [] li_texts = [] current_length = len("
    ") while idx < total: node_text = text_nodes[idx].strip() if len(node_text) > chunk_size: # if node_text is too long, split it translated_lines = [] split_text_recursive( node_text, chunk_size, translate_func, translated_lines ) # concatenate translated lines with \n text_nodes[idx].replace_with("\n".join(translated_lines)) idx += 1 continue li_str = f"
  1. {node_text}
  2. " if current_length + len(li_str) > chunk_size: break batch_nodes.append(text_nodes[idx]) li_texts.append(li_str) current_length += len(li_str) idx += 1 if not batch_nodes: # if all individual nodes are longer than chunk_size, translate it alone node_text = text_nodes[idx - 1].strip() li_str = f"
  3. {node_text}
  4. " batch_nodes = [text_nodes[idx - 1]] li_texts = [li_str] if batch_nodes: batch_text = "
      " + "".join(li_texts) + "
    " translated = translate_func(batch_text) trans_soup = BeautifulSoup(translated, "html.parser") translated_lis = trans_soup.find_all("li") for orig_node, li_tag in zip(batch_nodes, translated_lis): orig_node.replace_with(li_tag.decode_contents()) results.append(str(soup)) def split_original_texts(text): """ Split the original text into chunks. Args: text (str): The original text to be split. Returns: list: A list of strings representing the chunks of the original text. """ from bs4 import BeautifulSoup soup = BeautifulSoup(text, "html.parser") result = [] last_position = 0 contents = soup.contents i = 0 while i < len(contents): element = contents[i] str_element = str(element) if len(str_element) == 0: i += 1 continue # find element in original text start = text.find(str_element, last_position) if start != -1: end = start + len(str_element) element_str = text[start:end] else: # if element is not a tag, try to find it in original text if hasattr(element, "name") and element.name is not None: tag = element.name pat = r"<{tag}.*?>.*?".format(tag=tag) re_html = re.compile(pat, re.DOTALL) match = re_html.search(text, last_position) if match: start = match.start() end = match.end() element_str = text[start:end] else: element_str = str_element start = -1 end = -1 else: element_str = str_element start = -1 end = -1 true_start = True if start > 0 and text[start - 1] != "\n": true_start = False # process previous text if start != -1 and last_position < start: text_content = text[last_position:start] result = split_and_append_text(result, text_content) if hasattr(element, "name") and element.name is not None: if ( end < len(text) and end >= 0 and (text[end] not in ["\n", " "] or element_str.endswith("\n")) ): next_block_pos = text.find("\n\n", end) if next_block_pos == -1: mix_region_end = len(text) else: mix_region_end = next_block_pos + 2 j = i + 1 while j < len(contents): next_element_str = str(contents[j]) next_start = text.find(next_element_str, end) if next_start == -1 or next_start >= mix_region_end: break j += 1 if true_start: # merge text and html result.append( ("text_with_html", text[start:mix_region_end].rstrip("\n")) ) else: _, last_content = result[-1] result.pop() result.append( ( "text_with_html", last_content + text[start:mix_region_end].rstrip("\n"), ) ) last_position = mix_region_end i = j else: # pure HTML block if true_start: result.append(("html", element_str)) else: _, last_content = result[-1] result.pop() result.append(("html", last_content + element_str)) last_position = end i += 1 else: # normal text result = split_and_append_text(result, element_str) last_position = end if end != -1 else last_position + len(element_str) i += 1 # process remaining text if last_position < len(text): text_content = text[last_position:] result = split_and_append_text(result, text_content) return result def split_and_append_text(result, text_content): """ Split the text and append the result to the result list. Args: result (list): The current result list. text_content (str): The text content to be processed. Returns: list: The updated result list after processing the text content. """ if text_content.strip(): # match all code block interval code_pattern = re.compile(r"(```.*?\n.*?```|~~~.*?\n.*?~~~)", re.DOTALL) last_pos = 0 for m in code_pattern.finditer(text_content): # process text before code block if m.start() > last_pos: non_code = text_content[last_pos : m.start()] paragraphs = re.split(r"\n{2,}", non_code) for p in paragraphs: if p.strip(): result.append(("text", p.strip())) # process code block result.append(("code", m.group())) last_pos = m.end() # process remaining text if last_pos < len(text_content): non_code = text_content[last_pos:] paragraphs = re.split(r"\n{2,}", non_code) for p in paragraphs: if p.strip(): result.append(("text", p.strip())) return result