# zhengchun / PaddleX

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re


def _is_sentence_dot(text, i):
    """
    Check if the given character is a sentence ending punctuation.
    """
    # if the character is not a period, return False
    if text[i] != ".":
        return False
    # previous character
    prev = text[i - 1] if i > 0 else ""
    # next character
    next = text[i + 1] if i + 1 < len(text) else ""
    # previous is digit or letter, then not sentence ending punctuation
    if prev.isdigit() or prev.isalpha():
        return False
    # next is digit or letter, then not sentence ending punctuation
    if next.isdigit() or next.isalpha():
        return False
    # next is a punctuation, then sentence ending punctuation
    if next in ("", " ", "\t", "\n", '"', "'", "”", "’", ")", "】", "」", "》"):
        return True
    return False


def _find_split_pos(text, chunk_size):
    """
    Find the position to split the text into two chunks.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.

    Returns:
        int: The index where the text should be split.
    """
    center = len(text) // 2
    split_chars = ["\n", "。", ";", "；", "!", "！", "?", "？"]

    # Search forward
    for i in range(center, len(text)):
        if text[i] in split_chars:
            # Check for whitespace around the split character
            j = i + 1
            while j < len(text) and text[j] in " \t\n":
                j += 1
            if j < len(text) and len(text[:j]) <= chunk_size:
                return i, j
        elif text[i] == "." and _is_sentence_dot(text, i):
            j = i + 1
            while j < len(text) and text[j] in " \t\n":
                j += 1
            if j < len(text) and len(text[:j]) <= chunk_size:
                return i, j

    # Search backward
    for i in range(center, 0, -1):
        if text[i] in split_chars:
            j = i + 1
            while j < len(text) and text[j] in " \t\n":
                j += 1
            if len(text[:j]) <= chunk_size:
                return i, j
        elif text[i] == "." and _is_sentence_dot(text, i):
            j = i + 1
            while j < len(text) and text[j] in " \t\n":
                j += 1
            if len(text[:j]) <= chunk_size:
                return i, j

    # If no suitable position is found, split directly
    return min(chunk_size, len(text)), min(chunk_size, len(text))


def split_text_recursive(text, chunk_size, translate_func):
    """
    Translate text, recursively splitting it until each piece fits chunk_size.

    The text is stripped first. If it fits within ``chunk_size`` it is passed
    to ``translate_func`` as is. Otherwise it is cut at the position chosen by
    ``_find_split_pos``, both halves are processed recursively, and the
    results are joined with the original separator (the boundary character
    plus any whitespace following it), which is kept untranslated.

    Args:
        text (str): The original text to be split.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk
            of text and returns the translated string.

    Returns:
        str: The translated text.

    Raises:
        ValueError: If the text needs splitting but ``chunk_size`` is not
            positive, since no split could ever make progress.
    """
    text = text.strip()
    if len(text) <= chunk_size:
        return translate_func(text)

    # A non-positive limit can never be satisfied by splitting; fail loudly
    # instead of recursing without end.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be positive to split text, got {chunk_size}"
        )

    split_pos, end_whitespace = _find_split_pos(text, chunk_size)
    left = text[:split_pos]
    right = text[end_whitespace:]
    # Boundary character and trailing whitespace, re-inserted verbatim.
    separator = text[split_pos:end_whitespace]

    # An empty half contributes nothing rather than leaving a name unbound.
    left_text = split_text_recursive(left, chunk_size, translate_func) if left else ""
    right_text = (
        split_text_recursive(right, chunk_size, translate_func) if right else ""
    )

    return left_text + separator + right_text


def translate_code_block(code_block, chunk_size, translate_func, results):
    """
    Translate the body of a fenced code block and append it to ``results``.

    When the first line of the stripped block starts with a fence marker
    ("```" or "~~~"), that line is kept as the header and, if the last line
    also starts with a marker, it is kept as the footer; only the lines in
    between are translated. Without an opening fence the whole block is
    translated. Any translated line that itself starts with a fence marker
    is dropped before the header and footer are put back.

    Args:
        code_block (str): The code block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    fence_marks = ("```", "~~~")
    block_lines = code_block.strip().split("\n")
    opening = block_lines[0]

    if opening.startswith(fence_marks):
        header = opening
        closing = block_lines[-1]
        footer = closing if closing.startswith(fence_marks) else ""
        # Exclude the closing fence from the content only when one was found.
        inner_lines = block_lines[1:-1] if footer else block_lines[1:]
        code_content = "\n".join(inner_lines)
    else:
        header = footer = ""
        code_content = code_block

    translated = split_text_recursive(code_content, chunk_size, translate_func)

    # drop ``` or ~~~ lines that appear in the translated output
    kept_lines = [
        ln for ln in translated.split("\n") if not ln.strip().startswith(fence_marks)
    ]
    translated_code = "\n".join(kept_lines)

    if header:
        results.append(f"{header}\n{translated_code}\n{footer}")
    else:
        results.append(translated_code)


def translate_html_block(html_block, chunk_size, translate_func, results):
    """
    Translate a HTML block and append the result to the results list.

    Short, simple blocks (fewer than five "<" and ">" characters, balanced
    counts, shorter than ``chunk_size``) are sent to ``translate_func`` as a
    whole. Otherwise the block is parsed, its non-blank text nodes are
    translated, and the modified markup is serialized back:

    * a text node longer than ``chunk_size`` is translated on its own with
      ``split_text_recursive``;
    * shorter nodes are packed into ``<ol><li>...</li></ol>`` batches that
      stay within ``chunk_size``, and each translated ``<li>`` is written
      back over the node it came from;
    * a node that fits ``chunk_size`` but not once wrapped in the list markup
      is sent in a batch of its own.

    Args:
        html_block (str): The HTML block to be translated.
        chunk_size (int): The maximum size of each chunk.
        translate_func (callable): A function that translates a single chunk of text.
        results (list): A list to store the translated chunks.

    Returns:
        None
    """
    # if this is a short and simple tag, just translate it
    if (
        html_block.count("<") < 5
        and html_block.count(">") < 5
        and html_block.count("<") == html_block.count(">")
        and len(html_block) < chunk_size
    ):
        results.append(translate_func(html_block))
        return

    # Imported only once parsing is actually required.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_block, "html.parser")

    # collect text nodes that contain more than whitespace
    text_nodes = [
        node for node in soup.find_all(string=True, recursive=True) if node.strip()
    ]

    wrapper_length = len("<ol></ol>")
    idx = 0
    total = len(text_nodes)
    while idx < total:
        batch_nodes = []
        li_texts = []
        current_length = wrapper_length
        while idx < total:
            node_text = text_nodes[idx].strip()
            if len(node_text) > chunk_size:
                # if node_text is too long, split it and translate it alone
                translated_text = split_text_recursive(
                    node_text, chunk_size, translate_func
                )
                text_nodes[idx].replace_with(translated_text)
                idx += 1
                continue
            li_str = f"<li>{node_text}</li>"
            if current_length + len(li_str) > chunk_size:
                break
            batch_nodes.append(text_nodes[idx])
            li_texts.append(li_str)
            current_length += len(li_str)
            idx += 1

        if not batch_nodes and idx < total:
            # The pending node fits chunk_size by itself but not together with
            # the <ol>/<li> wrapper. Send it alone and advance past it so the
            # outer loop makes progress. (Nodes already handled above must not
            # be touched again: they are no longer part of the tree.)
            pending = text_nodes[idx]
            batch_nodes = [pending]
            li_texts = [f"<li>{pending.strip()}</li>"]
            idx += 1

        if batch_nodes:
            batch_text = "<ol>" + "".join(li_texts) + "</ol>"
            translated = translate_func(batch_text)
            trans_soup = BeautifulSoup(translated, "html.parser")
            translated_lis = trans_soup.find_all("li")
            # Pair originals with translated items in order; if the translator
            # returned fewer <li> items, the remaining nodes stay unchanged.
            for orig_node, li_tag in zip(batch_nodes, translated_lis):
                orig_node.replace_with(li_tag.decode_contents())

    results.append(str(soup))


def split_original_texts(text):
    """
    Split the original text into typed chunks.

    Top-level HTML elements found by the parser are swapped for placeholders,
    the remaining text is split by ``split_and_append_text``, and the
    placeholders are then substituted back. Every chunk that received at
    least one HTML element is retagged as "html".

    Args:
        text (str): The original text to be split.

    Returns:
        list: ``(kind, content)`` tuples in document order.
    """
    from bs4 import BeautifulSoup

    placeholder_fmt = "<<HTML_BLOCK_{}>>"
    soup = BeautifulSoup(text, "html.parser")

    # find all html blocks and replace them with placeholders
    html_blocks = []
    html_placeholders = []
    pieces = []
    for elem in soup.contents:
        if getattr(elem, "name", None) is not None:
            placeholder = placeholder_fmt.format(len(html_blocks))
            html_blocks.append(str(elem))
            html_placeholders.append(placeholder)
            pieces.append(placeholder)
        else:
            pieces.append(str(elem))
    text_after_placeholder = "".join(pieces)

    # split text into paragraphs and code blocks
    splited_block = split_and_append_text([], text_after_placeholder)

    # replace placeholders with html blocks, consuming them in order
    next_html = 0
    placeholder_count = len(html_placeholders)
    for pos, (_, content) in enumerate(splited_block):
        restored = False
        while (
            next_html < placeholder_count
            and html_placeholders[next_html] in content
        ):
            content = content.replace(
                html_placeholders[next_html], html_blocks[next_html]
            )
            next_html += 1
            restored = True
        if restored:
            splited_block[pos] = ("html", content)

    return splited_block


def split_and_append_text(result, text_content):
    """
    Break text into paragraphs and fenced code blocks, appending to ``result``.

    Fenced blocks (delimited by "```" or "~~~") are appended as
    ``("code", block)``. The text between them is split on runs of two or
    more newlines, and each non-blank paragraph is appended, stripped, as
    ``("text", paragraph)``. Blank input leaves ``result`` untouched.

    Args:
        result (list): The current result list.
        text_content (str): The text content to be processed.

    Returns:
        list: The updated result list after processing the text content.
    """
    if not text_content.strip():
        return result

    def _emit_paragraphs(segment):
        # Paragraphs are separated by one or more blank lines.
        for para in re.split(r"\n{2,}", segment):
            stripped = para.strip()
            if stripped:
                result.append(("text", stripped))

    # match all code block intervals
    fence_re = re.compile(r"(```.*?\n.*?```|~~~.*?\n.*?~~~)", re.DOTALL)
    cursor = 0
    for match in fence_re.finditer(text_content):
        # text that precedes this code block
        if match.start() > cursor:
            _emit_paragraphs(text_content[cursor : match.start()])
        result.append(("code", match.group()))
        cursor = match.end()

    # text that follows the last code block
    if cursor < len(text_content):
        _emit_paragraphs(text_content[cursor:])
    return result