utils.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import re
  15. def _find_split_pos(text, chunk_size):
  16. """
  17. Find the position to split the text into two chunks.
  18. Args:
  19. text (str): The original text to be split.
  20. chunk_size (int): The maximum size of each chunk.
  21. Returns:
  22. int: The index where the text should be split.
  23. """
  24. center = len(text) // 2
  25. # Search forward
  26. for i in range(center, len(text)):
  27. if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
  28. if i + 1 < len(text) and len(text[: i + 1]) <= chunk_size:
  29. return i + 1
  30. # Search backward
  31. for i in range(center, 0, -1):
  32. if text[i] in ["\n", ".", "。", ";", ";", "!", "!", "?", "?"]:
  33. if len(text[: i + 1]) <= chunk_size:
  34. return i + 1
  35. # If no suitable position is found, split directly
  36. return min(chunk_size, len(text))
  37. def split_text_recursive(text, chunk_size, translate_func, results):
  38. """
  39. Split the text recursively and translate each chunk.
  40. Args:
  41. text (str): The original text to be split.
  42. chunk_size (int): The maximum size of each chunk.
  43. translate_func (callable): A function that translates a single chunk of text.
  44. results (list): A list to store the translated chunks.
  45. Returns:
  46. None
  47. """
  48. text = text.strip()
  49. if len(text) <= chunk_size:
  50. results.append(translate_func(text))
  51. else:
  52. split_pos = _find_split_pos(text, chunk_size)
  53. left = text[:split_pos].strip()
  54. right = text[split_pos:].strip()
  55. if left:
  56. split_text_recursive(left, chunk_size, translate_func, results)
  57. if right:
  58. split_text_recursive(right, chunk_size, translate_func, results)
  59. def translate_code_block(code_block, chunk_size, translate_func, results):
  60. """
  61. Translate a code block and append the result to the results list.
  62. Args:
  63. code_block (str): The code block to be translated.
  64. chunk_size (int): The maximum size of each chunk.
  65. translate_func (callable): A function that translates a single chunk of text.
  66. results (list): A list to store the translated chunks.
  67. Returns:
  68. None
  69. """
  70. lines = code_block.strip().split("\n")
  71. if lines[0].startswith("```") or lines[0].startswith("~~~"):
  72. header = lines[0]
  73. footer = (
  74. lines[-1]
  75. if (lines[-1].startswith("```") or lines[-1].startswith("~~~"))
  76. else ""
  77. )
  78. code_content = "\n".join(lines[1:-1]) if footer else "\n".join(lines[1:])
  79. else:
  80. header = ""
  81. footer = ""
  82. code_content = code_block
  83. translated_code_lines = []
  84. split_text_recursive(
  85. code_content, chunk_size, translate_func, translated_code_lines
  86. )
  87. # drop ``` or ~~~
  88. filtered_code_lines = [
  89. line
  90. for line in translated_code_lines
  91. if not (line.strip().startswith("```") or line.strip().startswith("~~~"))
  92. ]
  93. translated_code = "\n".join(filtered_code_lines)
  94. result = f"{header}\n{translated_code}\n{footer}" if header else translated_code
  95. results.append(result)
  96. def translate_html_block(html_block, chunk_size, translate_func, results):
  97. """
  98. Translate a HTML block and append the result to the results list.
  99. Args:
  100. html_block (str): The HTML block to be translated.
  101. chunk_size (int): The maximum size of each chunk.
  102. translate_func (callable): A function that translates a single chunk of text.
  103. results (list): A list to store the translated chunks.
  104. Returns:
  105. None
  106. """
  107. from bs4 import BeautifulSoup
  108. soup = BeautifulSoup(html_block, "html.parser")
  109. # collect text nodes
  110. text_nodes = []
  111. for node in soup.find_all(string=True, recursive=True):
  112. text = node.strip()
  113. if text:
  114. text_nodes.append(node)
  115. idx = 0
  116. total = len(text_nodes)
  117. while idx < total:
  118. batch_nodes = []
  119. li_texts = []
  120. current_length = len("<ol></ol>")
  121. while idx < total:
  122. node_text = text_nodes[idx].strip()
  123. if len(node_text) > chunk_size:
  124. # if node_text is too long, split it
  125. translated_lines = []
  126. split_text_recursive(
  127. node_text, chunk_size, translate_func, translated_lines
  128. )
  129. # concatenate translated lines with \n
  130. text_nodes[idx].replace_with("\n".join(translated_lines))
  131. idx += 1
  132. continue
  133. li_str = f"<li>{node_text}</li>"
  134. if current_length + len(li_str) > chunk_size:
  135. break
  136. batch_nodes.append(text_nodes[idx])
  137. li_texts.append(li_str)
  138. current_length += len(li_str)
  139. idx += 1
  140. if not batch_nodes:
  141. # if all individual nodes are longer than chunk_size, translate it alone
  142. node_text = text_nodes[idx - 1].strip()
  143. li_str = f"<li>{node_text}</li>"
  144. batch_nodes = [text_nodes[idx - 1]]
  145. li_texts = [li_str]
  146. if batch_nodes:
  147. batch_text = "<ol>" + "".join(li_texts) + "</ol>"
  148. translated = translate_func(batch_text)
  149. trans_soup = BeautifulSoup(translated, "html.parser")
  150. translated_lis = trans_soup.find_all("li")
  151. for orig_node, li_tag in zip(batch_nodes, translated_lis):
  152. orig_node.replace_with(li_tag.decode_contents())
  153. results.append(str(soup))
  154. def split_original_texts(text):
  155. """
  156. Split the original text into chunks.
  157. Args:
  158. text (str): The original text to be split.
  159. Returns:
  160. list: A list of strings representing the chunks of the original text.
  161. """
  162. from bs4 import BeautifulSoup
  163. soup = BeautifulSoup(text, "html.parser")
  164. result = []
  165. last_position = 0
  166. contents = soup.contents
  167. i = 0
  168. while i < len(contents):
  169. element = contents[i]
  170. str_element = str(element)
  171. if len(str_element) == 0:
  172. i += 1
  173. continue
  174. # find element in original text
  175. start = text.find(str_element, last_position)
  176. if start != -1:
  177. end = start + len(str_element)
  178. element_str = text[start:end]
  179. else:
  180. # if element is not a tag, try to find it in original text
  181. if hasattr(element, "name") and element.name is not None:
  182. tag = element.name
  183. pat = r"<{tag}.*?>.*?</{tag}>".format(tag=tag)
  184. re_html = re.compile(pat, re.DOTALL)
  185. match = re_html.search(text, last_position)
  186. if match:
  187. start = match.start()
  188. end = match.end()
  189. element_str = text[start:end]
  190. else:
  191. element_str = str_element
  192. start = -1
  193. end = -1
  194. else:
  195. element_str = str_element
  196. start = -1
  197. end = -1
  198. true_start = True
  199. if start > 0 and text[start - 1] != "\n":
  200. true_start = False
  201. # process previous text
  202. if start != -1 and last_position < start:
  203. text_content = text[last_position:start]
  204. result = split_and_append_text(result, text_content)
  205. if hasattr(element, "name") and element.name is not None:
  206. if (
  207. end < len(text)
  208. and end >= 0
  209. and (text[end] not in ["\n", " "] or element_str.endswith("\n"))
  210. ):
  211. next_block_pos = text.find("\n\n", end)
  212. if next_block_pos == -1:
  213. mix_region_end = len(text)
  214. else:
  215. mix_region_end = next_block_pos + 2
  216. j = i + 1
  217. while j < len(contents):
  218. next_element_str = str(contents[j])
  219. next_start = text.find(next_element_str, end)
  220. if next_start == -1 or next_start >= mix_region_end:
  221. break
  222. j += 1
  223. if true_start:
  224. # merge text and html
  225. result.append(
  226. ("text_with_html", text[start:mix_region_end].rstrip("\n"))
  227. )
  228. else:
  229. _, last_content = result[-1]
  230. result.pop()
  231. result.append(
  232. (
  233. "text_with_html",
  234. last_content + text[start:mix_region_end].rstrip("\n"),
  235. )
  236. )
  237. last_position = mix_region_end
  238. i = j
  239. else:
  240. # pure HTML block
  241. if true_start:
  242. result.append(("html", element_str))
  243. else:
  244. _, last_content = result[-1]
  245. result.pop()
  246. result.append(("html", last_content + element_str))
  247. last_position = end
  248. i += 1
  249. else:
  250. # normal text
  251. result = split_and_append_text(result, element_str)
  252. last_position = end if end != -1 else last_position + len(element_str)
  253. i += 1
  254. # process remaining text
  255. if last_position < len(text):
  256. text_content = text[last_position:]
  257. result = split_and_append_text(result, text_content)
  258. return result
  259. def split_and_append_text(result, text_content):
  260. """
  261. Split the text and append the result to the result list.
  262. Args:
  263. result (list): The current result list.
  264. text_content (str): The text content to be processed.
  265. Returns:
  266. list: The updated result list after processing the text content.
  267. """
  268. if text_content.strip():
  269. # match all code block interval
  270. code_pattern = re.compile(r"(```.*?\n.*?```|~~~.*?\n.*?~~~)", re.DOTALL)
  271. last_pos = 0
  272. for m in code_pattern.finditer(text_content):
  273. # process text before code block
  274. if m.start() > last_pos:
  275. non_code = text_content[last_pos : m.start()]
  276. paragraphs = re.split(r"\n{2,}", non_code)
  277. for p in paragraphs:
  278. if p.strip():
  279. result.append(("text", p.strip()))
  280. # process code block
  281. result.append(("code", m.group()))
  282. last_pos = m.end()
  283. # process remaining text
  284. if last_pos < len(text_content):
  285. non_code = text_content[last_pos:]
  286. paragraphs = re.split(r"\n{2,}", non_code)
  287. for p in paragraphs:
  288. if p.strip():
  289. result.append(("text", p.strip()))
  290. return result