# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- import re
- def _find_split_pos(text, chunk_size):
- """
- Find the position to split the text into two chunks.
- Args:
- text (str): The original text to be split.
- chunk_size (int): The maximum size of each chunk.
- Returns:
- int: The index where the text should be split.
- """
- center = len(text) // 2
- # Search forward
- for i in range(center, len(text)):
- if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
- if i + 1 < len(text) and len(text[: i + 1]) <= chunk_size:
- return i + 1
- # Search backward
- for i in range(center, 0, -1):
- if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
- if len(text[: i + 1]) <= chunk_size:
- return i + 1
- # If no suitable position is found, split directly
- return min(chunk_size, len(text))
def split_text_recursive(text, chunk_size, translate_func, results):
    """
    Split the text into chunks of at most ``chunk_size`` and translate each one.

    Pieces are stripped of surrounding whitespace, split at the position
    chosen by ``_find_split_pos`` and translated in their original
    left-to-right order. The work list is an explicit stack rather than
    function recursion, so very long texts combined with a small
    ``chunk_size`` cannot exhaust the interpreter's recursion limit.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None

    Raises:
        ValueError: If a piece has to be split but ``chunk_size`` is smaller
            than 1, which would never make progress.
    """
    pending = [text]
    while pending:
        piece = pending.pop().strip()
        if len(piece) <= chunk_size:
            results.append(translate_func(piece))
            continue

        if chunk_size < 1:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        split_pos = _find_split_pos(piece, chunk_size)
        left = piece[:split_pos].strip()
        right = piece[split_pos:].strip()
        # The stack is LIFO: push the right half first so that the left half
        # is fully processed before it, preserving the text order in results.
        if right:
            pending.append(right)
        if left:
            pending.append(left)
def translate_code_block(code_block, chunk_size, translate_func, results):
    """
    Translate a code block and append the result to the results list.

    When the block starts with a fence line (``` or ~~~), the opening fence
    and, if present, the closing fence are kept verbatim and only the content
    between them is passed to the translator. Without an opening fence the
    whole block is translated and no fence is added.

    Args:
        code_block (str): The code block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    fence_markers = ("```", "~~~")

    lines = code_block.strip().split("\n")
    if lines[0].startswith(fence_markers):
        header = lines[0]
        footer = lines[-1] if lines[-1].startswith(fence_markers) else ""
        code_content = "\n".join(lines[1:-1]) if footer else "\n".join(lines[1:])
    else:
        header = ""
        footer = ""
        code_content = code_block

    translated_chunks = []
    split_text_recursive(code_content, chunk_size, translate_func, translated_chunks)

    # drop ``` or ~~~
    # A translated chunk may come back wrapped in its own fence. Remove only
    # the wrapping fence lines and keep the chunk's content; a chunk that is
    # nothing but a fence is dropped entirely.
    cleaned_chunks = []
    for chunk in translated_chunks:
        if chunk.strip().startswith(fence_markers):
            inner_lines = chunk.strip().split("\n")[1:]
            if inner_lines and inner_lines[-1].strip().startswith(fence_markers):
                inner_lines = inner_lines[:-1]
            chunk = "\n".join(inner_lines)
            if not chunk.strip():
                continue
        cleaned_chunks.append(chunk)

    translated_code = "\n".join(cleaned_chunks)
    result = f"{header}\n{translated_code}\n{footer}" if header else translated_code
    results.append(result)
def translate_html_block(html_block, chunk_size, translate_func, results):
    """
    Translate a HTML block and append the result to the results list.

    The block is parsed with BeautifulSoup and every non-blank text node is
    translated in place, so the surrounding markup is preserved. Short nodes
    are batched into a single ``<ol><li>...</li></ol>`` request whose length
    stays within ``chunk_size`` where possible; nodes longer than
    ``chunk_size`` are split with ``split_text_recursive`` and their
    translated pieces are joined with newlines.

    Args:
        html_block (str): The HTML block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_block, "html.parser")

    # collect text nodes
    text_nodes = [
        node for node in soup.find_all(string=True, recursive=True) if node.strip()
    ]

    wrapper_length = len("<ol></ol>")
    idx = 0
    total = len(text_nodes)
    while idx < total:
        batch_nodes = []
        li_texts = []
        current_length = wrapper_length
        while idx < total:
            node = text_nodes[idx]
            node_text = node.strip()
            if len(node_text) > chunk_size:
                # if node_text is too long, split it
                translated_lines = []
                split_text_recursive(
                    node_text, chunk_size, translate_func, translated_lines
                )
                # concatenate translated lines with \n
                node.replace_with("\n".join(translated_lines))
                idx += 1
                continue

            li_str = f"<li>{node_text}</li>"
            # Close the batch once it would grow past chunk_size. The first
            # node of a batch is always accepted, even if the list markup
            # pushes it slightly over the limit, so that every node is
            # translated exactly once and idx always advances.
            if batch_nodes and current_length + len(li_str) > chunk_size:
                break
            batch_nodes.append(node)
            li_texts.append(li_str)
            current_length += len(li_str)
            idx += 1

        if not batch_nodes:
            # Only over-long nodes were left and they were already handled.
            continue

        batch_text = "<ol>" + "".join(li_texts) + "</ol>"
        translated = translate_func(batch_text)
        trans_soup = BeautifulSoup(translated, "html.parser")
        translated_lis = trans_soup.find_all("li")
        # zip() stops at the shorter sequence: if the translation contains
        # fewer <li> items than were sent, the trailing nodes keep their
        # original text.
        for orig_node, li_tag in zip(batch_nodes, translated_lis):
            orig_node.replace_with(li_tag.decode_contents())

    results.append(str(soup))
def split_original_texts(text):
    """
    Split the original text into chunks.

    The text is parsed with BeautifulSoup and its top-level nodes are mapped
    back onto the original string. Plain text (including the text between
    located nodes) is delegated to ``split_and_append_text``; tags become
    either a standalone ``"html"`` item or, when they are embedded in running
    text, a ``"text_with_html"`` item that extends to the next blank line.

    Args:
        text (str): The original text to be split.

    Returns:
        list: A list of ``(kind, content)`` tuples in document order, where
            ``kind`` is ``"html"`` or ``"text_with_html"`` for items created
            here, or whatever ``split_and_append_text`` emits for plain text.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(text, "html.parser")
    result = []
    last_position = 0
    contents = soup.contents
    i = 0
    while i < len(contents):
        element = contents[i]
        str_element = str(element)
        if len(str_element) == 0:
            i += 1
            continue

        # find element in original text
        start = text.find(str_element, last_position)
        if start != -1:
            end = start + len(str_element)
            element_str = text[start:end]
        else:
            # The serialized node differs from the source text. For a tag,
            # fall back to a regex over "<tag ...>...</tag>"; otherwise keep
            # the serialized form and mark the position as unknown (-1).
            if hasattr(element, "name") and element.name is not None:
                tag = element.name
                pat = r"<{tag}.*?>.*?</{tag}>".format(tag=tag)
                re_html = re.compile(pat, re.DOTALL)
                match = re_html.search(text, last_position)
                if match:
                    start = match.start()
                    end = match.end()
                    element_str = text[start:end]
                else:
                    element_str = str_element
                    start = -1
                    end = -1
            else:
                element_str = str_element
                start = -1
                end = -1

        # The element "truly starts" a block unless it is preceded on the
        # same line by other characters.
        true_start = True
        if start > 0 and text[start - 1] != "\n":
            true_start = False

        # process previous text
        if start != -1 and last_position < start:
            text_content = text[last_position:start]
            result = split_and_append_text(result, text_content)

        if hasattr(element, "name") and element.name is not None:
            if (
                end < len(text)
                and end >= 0
                and (text[end] not in ["\n", " "] or element_str.endswith("\n"))
            ):
                # The tag is directly followed by more content: treat
                # everything up to and including the next blank line (or the
                # end of the text) as one mixed text/HTML region.
                next_block_pos = text.find("\n\n", end)
                if next_block_pos == -1:
                    mix_region_end = len(text)
                else:
                    mix_region_end = next_block_pos + 2

                # Skip the following top-level nodes that lie inside the
                # region; they are already covered by it.
                j = i + 1
                while j < len(contents):
                    next_element_str = str(contents[j])
                    next_start = text.find(next_element_str, end)
                    if next_start == -1 or next_start >= mix_region_end:
                        break
                    j += 1

                # `not result` guards the merge below: when the tag does not
                # start a line but nothing was emitted before it (e.g. only
                # whitespace preceded it), there is no item to merge with.
                if true_start or not result:
                    # merge text and html
                    result.append(
                        ("text_with_html", text[start:mix_region_end].rstrip("\n"))
                    )
                else:
                    _, last_content = result[-1]
                    result.pop()
                    result.append(
                        (
                            "text_with_html",
                            last_content + text[start:mix_region_end].rstrip("\n"),
                        )
                    )
                last_position = mix_region_end
                i = j
            else:
                # pure HTML block
                if true_start or not result:
                    result.append(("html", element_str))
                else:
                    _, last_content = result[-1]
                    result.pop()
                    result.append(("html", last_content + element_str))
                # When the tag could not be located (end == -1), advance by
                # the serialized length, as the plain-text branch does,
                # instead of moving the cursor to -1.
                last_position = end if end != -1 else last_position + len(element_str)
                i += 1
        else:
            # normal text
            result = split_and_append_text(result, element_str)
            last_position = end if end != -1 else last_position + len(element_str)
            i += 1

    # process remaining text
    if last_position < len(text):
        text_content = text[last_position:]
        result = split_and_append_text(result, text_content)
    return result
# A fenced code block: an opening ``` or ~~~ line followed by everything up
# to the next matching fence marker.
_CODE_FENCE_PATTERN = re.compile(r"(```.*?\n.*?```|~~~.*?\n.*?~~~)", re.DOTALL)
# Two or more consecutive newlines separate paragraphs.
_PARAGRAPH_BREAK_PATTERN = re.compile(r"\n{2,}")


def _append_paragraphs(result, segment):
    """Append every non-blank paragraph of ``segment`` to ``result`` as a "text" item."""
    for paragraph in _PARAGRAPH_BREAK_PATTERN.split(segment):
        paragraph = paragraph.strip()
        if paragraph:
            result.append(("text", paragraph))


def split_and_append_text(result, text_content):
    """
    Split the text and append the result to the result list.

    Fenced code blocks are appended unchanged as ``("code", block)`` items.
    The text around them is split on blank lines and each non-blank
    paragraph is appended, stripped, as a ``("text", paragraph)`` item.
    Whitespace-only input appends nothing.

    Args:
        result (list): The current result list. It is extended in place.
        text_content (str): The text content to be processed.

    Returns:
        list: The updated result list after processing the text content
            (the same list object that was passed in).
    """
    if not text_content.strip():
        return result

    last_pos = 0
    for code_match in _CODE_FENCE_PATTERN.finditer(text_content):
        # process text before code block
        if code_match.start() > last_pos:
            _append_paragraphs(result, text_content[last_pos : code_match.start()])
        # process code block
        result.append(("code", code_match.group()))
        last_pos = code_match.end()

    # process remaining text
    if last_pos < len(text_content):
        _append_paragraphs(result, text_content[last_pos:])
    return result
|