Explorar o código

bugfix: add # to start of paragraph title

gaotingquan hai 8 meses
pai
achega
8f27b1e7ae
Modificouse 1 ficheiro con 20 adicións e 8 borrados
  1. 20 8
      paddlex/inference/pipelines/layout_parsing/result_v2.py

+ 20 - 8
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -253,24 +253,36 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
 
         def _format_data(obj):
 
-            def format_title(content_value):
+            def format_title(title):
                 """
-                Normalize chapter title by ensuring one space between numbering and title content.
+                Normalize chapter title.
+                Prefix the title with '#' characters to indicate its heading level.
                 If numbering exists, ensure there's exactly one space between it and the title content.
                 If numbering does not exist, the numbering/spacing normalization is skipped; the '#' prefix is still added.
 
-                :param content_value: Original chapter title string.
+                :param title: Original chapter title string.
                 :return: Normalized chapter title string.
                 """
-                match = self.title_pattern.match(content_value)
+                match = self.title_pattern.match(title)
                 if match:
                     numbering = match.group(1).strip()
                     title_content = match.group(3).lstrip()
                     # Return numbering and title content separated by one space
-                    return numbering + " " + title_content
-                else:
-                    # No numbering detected; return original title
-                    return content_value
+                    title = numbering + " " + title_content
+
+                title = title.rstrip(".")
+                level = (
+                    title.count(
+                        ".",
+                    )
+                    + 1
+                    if "." in title
+                    else 1
+                )
+                return f"#{'#' * level} {title}".replace("-\n", "").replace(
+                    "\n",
+                    " ",
+                )
 
             def format_centered_text(key):
                 return (