| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320 |
- # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import re
- def _is_sentence_dot(text, i):
- """
- Check if the given character is a sentence ending punctuation.
- """
- # if the character is not a period, return False
- if text[i] != ".":
- return False
- # previous character
- prev = text[i - 1] if i > 0 else ""
- # next character
- next = text[i + 1] if i + 1 < len(text) else ""
- # previous is digit or letter, then not sentence ending punctuation
- if prev.isdigit() or prev.isalpha():
- return False
- # next is digit or letter, then not sentence ending punctuation
- if next.isdigit() or next.isalpha():
- return False
- # next is a punctuation, then sentence ending punctuation
- if next in ("", " ", "\t", "\n", '"', "'", "”", "’", ")", "】", "」", "》"):
- return True
- return False
- def _find_split_pos(text, chunk_size):
- """
- Find the position to split the text into two chunks.
- Args:
- text (str): The original text to be split.
- chunk_size (int): The maximum size of each chunk.
- Returns:
- int: The index where the text should be split.
- """
- center = len(text) // 2
- split_chars = ["\n", "。", ";", ";", "!", "!", "?", "?"]
- # Search forward
- for i in range(center, len(text)):
- if text[i] in split_chars:
- # Check for whitespace around the split character
- j = i + 1
- while j < len(text) and text[j] in " \t\n":
- j += 1
- if j < len(text) and len(text[:j]) <= chunk_size:
- return i, j
- elif text[i] == "." and _is_sentence_dot(text, i):
- j = i + 1
- while j < len(text) and text[j] in " \t\n":
- j += 1
- if j < len(text) and len(text[:j]) <= chunk_size:
- return i, j
- # Search backward
- for i in range(center, 0, -1):
- if text[i] in split_chars:
- j = i + 1
- while j < len(text) and text[j] in " \t\n":
- j += 1
- if len(text[:j]) <= chunk_size:
- return i, j
- elif text[i] == "." and _is_sentence_dot(text, i):
- j = i + 1
- while j < len(text) and text[j] in " \t\n":
- j += 1
- if len(text[:j]) <= chunk_size:
- return i, j
- # If no suitable position is found, split directly
- return min(chunk_size, len(text)), min(chunk_size, len(text))
def split_text_recursive(text, chunk_size, translate_func):
    """
    Split the text recursively, translate each chunk and re-join the result.

    The text is stripped first. If it fits in ``chunk_size`` it is translated
    in one call; otherwise it is cut at the position chosen by
    ``_find_split_pos`` and both sides are processed recursively. The
    delimiter (and the whitespace after it) between the two sides is copied
    to the output untranslated.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.

    Returns:
        The translated text: ``translate_func``'s result for a single chunk,
        or the concatenation of the translated left part, the original
        separator and the translated right part.

    Raises:
        ValueError: If the text has to be split but ``chunk_size`` is not
            positive (no cut could ever make progress).
    """
    text = text.strip()
    if len(text) <= chunk_size:
        return translate_func(text)

    # Guard against endless recursion: with chunk_size <= 0 the left part
    # would be empty and the right part would be the whole text again.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    split_pos, end_whitespace = _find_split_pos(text, chunk_size)
    left = text[:split_pos]
    separator = text[split_pos:end_whitespace]
    right = text[end_whitespace:]

    # Default to "" so an empty side can never leave a name unbound.
    left_text = split_text_recursive(left, chunk_size, translate_func) if left else ""
    right_text = (
        split_text_recursive(right, chunk_size, translate_func) if right else ""
    )
    return left_text + separator + right_text
def translate_code_block(code_block, chunk_size, translate_func, results):
    """
    Translate the body of a fenced code block and store it in ``results``.

    The opening fence line (``` or ~~~) and, when present, the closing fence
    line are kept as they are; only the content between them is sent through
    ``split_text_recursive``. Fence lines that show up in the translated
    content are removed before the block is reassembled.

    Args:
        code_block (str): The code block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    fence_markers = ("```", "~~~")
    lines = code_block.strip().split("\n")
    first_line, last_line = lines[0], lines[-1]

    if first_line.startswith(fence_markers):
        header = first_line
        if last_line.startswith(fence_markers):
            # Properly closed block: drop both fence lines from the content.
            footer = last_line
            code_content = "\n".join(lines[1:-1])
        else:
            # Opening fence without a closing one.
            footer = ""
            code_content = "\n".join(lines[1:])
    else:
        # Not fenced at all: translate the block as given.
        header = footer = ""
        code_content = code_block

    translated = split_text_recursive(code_content, chunk_size, translate_func)

    # drop ``` or ~~~ lines that appear in the translated content
    kept_lines = [
        line
        for line in translated.split("\n")
        if not line.strip().startswith(fence_markers)
    ]
    translated_code = "\n".join(kept_lines)

    if header:
        results.append(f"{header}\n{translated_code}\n{footer}")
    else:
        results.append(translated_code)
def translate_html_block(html_block, chunk_size, translate_func, results):
    """
    Translate an HTML block and append the result to the results list.

    A short block with fewer than five, balanced ``<``/``>`` characters is
    handed to ``translate_func`` as-is. Any other block is parsed with
    BeautifulSoup; its non-blank text nodes are translated in batches wrapped
    as ``<ol><li>...</li></ol>`` so that one ``translate_func`` call covers
    several nodes, and each translated ``<li>`` replaces its source node.
    A text node longer than ``chunk_size`` is translated on its own through
    ``split_text_recursive``.

    Args:
        html_block (str): The HTML block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    # if this is a short and simple tag, just translate it
    open_count = html_block.count("<")
    close_count = html_block.count(">")
    if (
        open_count < 5
        and open_count == close_count
        and len(html_block) < chunk_size
    ):
        results.append(translate_func(html_block))
        return

    # Imported only when parsing is needed, so the fast path above works
    # without touching the parser.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_block, "html.parser")

    # collect text nodes that contain something besides whitespace
    text_nodes = [
        node for node in soup.find_all(string=True, recursive=True) if node.strip()
    ]

    wrapper_length = len("<ol></ol>")
    idx = 0
    total = len(text_nodes)
    while idx < total:
        batch_nodes = []
        li_texts = []
        current_length = wrapper_length
        while idx < total:
            node = text_nodes[idx]
            node_text = node.strip()
            if len(node_text) > chunk_size:
                # Too long for a single request: split_text_recursive returns
                # the fully re-assembled translation as one string.
                node.replace_with(
                    split_text_recursive(node_text, chunk_size, translate_func)
                )
                idx += 1
                continue
            li_str = f"<li>{node_text}</li>"
            if current_length + len(li_str) > chunk_size:
                if not batch_nodes:
                    # The node fits chunk_size by itself but not together
                    # with the <ol>/<li> wrapper. Send it as a batch of one
                    # and advance, otherwise the outer loop would never make
                    # progress.
                    batch_nodes.append(node)
                    li_texts.append(li_str)
                    idx += 1
                break
            batch_nodes.append(node)
            li_texts.append(li_str)
            current_length += len(li_str)
            idx += 1

        if not batch_nodes:
            # Everything consumed in this pass was an oversized node that has
            # already been replaced; there is nothing left to batch.
            continue

        batch_text = "<ol>" + "".join(li_texts) + "</ol>"
        translated = translate_func(batch_text)
        trans_soup = BeautifulSoup(translated, "html.parser")
        translated_lis = trans_soup.find_all("li")
        # Pair nodes with translated items by position; if the translator
        # returned fewer <li> items, the remaining nodes keep their text.
        for orig_node, li_tag in zip(batch_nodes, translated_lis):
            orig_node.replace_with(li_tag.decode_contents())

    results.append(str(soup))
def split_original_texts(text):
    """
    Split the original text into typed blocks.

    Top-level HTML elements found by BeautifulSoup are swapped for
    ``<<HTML_BLOCK_n>>`` placeholders, the remaining text is split by
    ``split_and_append_text``, and the placeholders are then expanded back to
    their HTML. Every block in which a placeholder was expanded is re-tagged
    as ``"html"``.

    Args:
        text (str): The text to be split.

    Returns:
        list: ``(kind, content)`` tuples, where ``kind`` is whatever
        ``split_and_append_text`` assigned or ``"html"``.
    """
    from bs4 import BeautifulSoup

    # find all html blocks and replace them with placeholders
    soup = BeautifulSoup(text, "html.parser")
    html_blocks = []
    html_placeholders = []
    pieces = []
    for elem in soup.contents:
        if getattr(elem, "name", None) is None:
            # Plain text between elements is carried over unchanged.
            pieces.append(str(elem))
            continue
        placeholder = "<<HTML_BLOCK_{}>>".format(len(html_blocks))
        html_blocks.append(str(elem))
        html_placeholders.append(placeholder)
        pieces.append(placeholder)
    text_after_placeholder = "".join(pieces)

    # split text into paragraphs
    splited_block = split_and_append_text([], text_after_placeholder)

    # replace placeholders with html blocks; placeholders are consumed in
    # document order, so a single cursor is shared across all blocks
    cursor = 0
    placeholder_count = len(html_placeholders)
    for position, (_, content) in enumerate(splited_block):
        while cursor < placeholder_count and html_placeholders[cursor] in content:
            content = content.replace(html_placeholders[cursor], html_blocks[cursor])
            cursor += 1
            splited_block[position] = ("html", content)
    return splited_block
def split_and_append_text(result, text_content):
    """
    Break the text into paragraphs and fenced code blocks and add them to ``result``.

    Fenced code blocks (``` or ~~~) are appended as ``("code", block)``.
    The text around them is split on runs of two or more newlines and each
    non-blank paragraph is appended, stripped, as ``("text", paragraph)``.
    Blank input leaves ``result`` untouched.

    Args:
        result (list): The current result list.
        text_content (str): The text content to be processed.

    Returns:
        list: The same ``result`` list, updated in place.
    """
    if not text_content.strip():
        return result

    def _append_paragraphs(segment):
        # Paragraphs are separated by at least one empty line.
        for paragraph in re.split(r"\n{2,}", segment):
            stripped = paragraph.strip()
            if stripped:
                result.append(("text", stripped))

    # match all code block intervals
    fence_re = re.compile(r"(```.*?\n.*?```|~~~.*?\n.*?~~~)", re.DOTALL)
    cursor = 0
    for match in fence_re.finditer(text_content):
        # text that precedes this code block
        if match.start() > cursor:
            _append_paragraphs(text_content[cursor : match.start()])
        result.append(("code", match.group()))
        cursor = match.end()

    # text that follows the last code block
    if cursor < len(text_content):
        _append_paragraphs(text_content[cursor:])
    return result
|