Browse Source

support concatenate_markdown_pages (#4622)

* add PP-DocLayoutV2 in official models

* support concatenate_markdown_pages
changdazhou 1 month ago
parent
commit
e26ed5d6a7
1 changed file with 17 additions and 0 deletions
  1. 17 0
      paddlex/inference/pipelines/paddleocr_vl/pipeline.py

+ 17 - 0
paddlex/inference/pipelines/paddleocr_vl/pipeline.py

@@ -675,6 +675,23 @@ class _PaddleOCRVLPipeline(BasePipeline):
                 if thread_vlm.is_alive():
                     logging.warning("VLM worker did not terminate in time")
 
+    def concatenate_markdown_pages(self, markdown_list: list) -> str:
+        """
+        Concatenate Markdown content from multiple pages into a single document.
+
+        Each page's text is preceded by a blank-line separator (two newlines),
+        so a non-empty result starts with it; an empty list gives an empty string.
+
+        Args:
+            markdown_list (list): Per-page mappings, each providing its Markdown
+                text under the "markdown_texts" key.
+
+        Returns:
+            str: The concatenated Markdown text.
+        """
+        # join() builds the result in one pass; output is identical to `+=`.
+        return "".join("\n\n" + res["markdown_texts"] for res in markdown_list)
+
 
 @pipeline_requires_extra("ocr")
 class PaddleOCRVLPipeline(AutoParallelImageSimpleInferencePipeline):