Browse Source

fix(ocr_mkcontent): handle empty paragraphs on pages

- Add empty paragraph handling for pages with no content
- Append an empty markdown object when a page has no paragraphs
- Increment page number even if no content is present
myhloli 11 months ago
parent
commit
782e6571bc
1 changed file with 7 additions and 1 deletion
  1. magic_pdf/dict2md/ocr_mkcontent.py (+7 -1)

+ 7 - 1
magic_pdf/dict2md/ocr_mkcontent.py

@@ -5,7 +5,6 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.para.para_split_v3 import ListLineTag
 
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
         if not paras_of_layout:
+            markdown_with_para_and_pagination.append({
+                'page_no':
+                    page_no,
+                'md_content':
+                    '',
+            })
+            page_no += 1
             continue
         page_markdown = ocr_mk_markdown_with_para_core_v2(
             paras_of_layout, 'mm', img_buket_path)