# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import re def _is_sentence_dot(text, i): """ Check if the given character is a sentence ending punctuation. """ # if the character is not a period, return False if text[i] != ".": return False # previous character prev = text[i - 1] if i > 0 else "" # next character next = text[i + 1] if i + 1 < len(text) else "" # previous is digit or letter, then not sentence ending punctuation if prev.isdigit() or prev.isalpha(): return False # next is digit or letter, then not sentence ending punctuation if next.isdigit() or next.isalpha(): return False # next is a punctuation, then sentence ending punctuation if next in ("", " ", "\t", "\n", '"', "'", "”", "’", ")", "】", "」", "》"): return True return False def _find_split_pos(text, chunk_size): """ Find the position to split the text into two chunks. Args: text (str): The original text to be split. chunk_size (int): The maximum size of each chunk. Returns: int: The index where the text should be split. """ center = len(text) // 2 split_chars = ["\n", "。", ";", ";", "!", "!", "?", "?"] # Search forward for i in range(center, len(text)): if text[i] in split_chars: # Check for whitespace around the split character j = i + 1 while j < len(text) and text[j] in " \t\n": j += 1 if j < len(text) and len(text[:j]) <= chunk_size: return i, j elif text[i] == "." and _is_sentence_dot(text, i): j = i + 1 while j < len(text) and text[j] in " \t\n": j += 1 if j < len(text) and len(text[:j]) <= chunk_size: return i, j # Search backward for i in range(center, 0, -1): if text[i] in split_chars: j = i + 1 while j < len(text) and text[j] in " \t\n": j += 1 if len(text[:j]) <= chunk_size: return i, j elif text[i] == "." and _is_sentence_dot(text, i): j = i + 1 while j < len(text) and text[j] in " \t\n": j += 1 if len(text[:j]) <= chunk_size: return i, j # If no suitable position is found, split directly return min(chunk_size, len(text)), min(chunk_size, len(text)) def split_text_recursive(text, chunk_size, translate_func): """ Split the text recursively and translate each chunk. Args: text (str): The original text to be split. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ text = text.strip() if len(text) <= chunk_size: return translate_func(text) else: split_pos, end_whitespace = _find_split_pos(text, chunk_size) left = text[:split_pos] right = text[end_whitespace:] whitespace = text[split_pos:end_whitespace] if left: left_text = split_text_recursive(left, chunk_size, translate_func) if right: right_text = split_text_recursive(right, chunk_size, translate_func) return left_text + whitespace + right_text def translate_code_block(code_block, chunk_size, translate_func, results): """ Translate a code block and append the result to the results list. Args: code_block (str): The code block to be translated. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ lines = code_block.strip().split("\n") if lines[0].startswith("```") or lines[0].startswith("~~~"): header = lines[0] footer = ( lines[-1] if (lines[-1].startswith("```") or lines[-1].startswith("~~~")) else "" ) code_content = "\n".join(lines[1:-1]) if footer else "\n".join(lines[1:]) else: header = "" footer = "" code_content = code_block translated_code_lines = split_text_recursive( code_content, chunk_size, translate_func ) # drop ``` or ~~~ filtered_code_lines = [ line for line in translated_code_lines.split("\n") if not (line.strip().startswith("```") or line.strip().startswith("~~~")) ] translated_code = "\n".join(filtered_code_lines) result = f"{header}\n{translated_code}\n{footer}" if header else translated_code results.append(result) def translate_html_block(html_block, chunk_size, translate_func, results): """ Translate a HTML block and append the result to the results list. Args: html_block (str): The HTML block to be translated. chunk_size (int): The maximum size of each chunk. translate_func (callable): A function that translates a single chunk of text. results (list): A list to store the translated chunks. Returns: None """ from bs4 import BeautifulSoup # if this is a short and simple tag, just translate it if ( html_block.count("<") < 5 and html_block.count(">") < 5 and html_block.count("<") == html_block.count(">") and len(html_block) < chunk_size ): translated = translate_func(html_block) results.append(translated) return soup = BeautifulSoup(html_block, "html.parser") # collect text nodes text_nodes = [] for node in soup.find_all(string=True, recursive=True): text = node.strip() if text: text_nodes.append(node) idx = 0 total = len(text_nodes) while idx < total: batch_nodes = [] li_texts = [] current_length = len("