Forráskód Böngészése

Merge pull request #1139 from myhloli/dev

fix(ocr_mkcontent): handle empty paragraphs on pages
Xiaomeng Zhao 11 hónapja
szülő
commit
086b48b7ae
1 módosított fájl, 7 hozzáadás és 1 törlés
  1. magic_pdf/dict2md/ocr_mkcontent.py (+7 -1)

+ 7 - 1
magic_pdf/dict2md/ocr_mkcontent.py

@@ -5,7 +5,6 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.para.para_split_v3 import ListLineTag
 
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
         if not paras_of_layout:
+            markdown_with_para_and_pagination.append({
+                'page_no':
+                    page_no,
+                'md_content':
+                    '',
+            })
+            page_no += 1
             continue
         page_markdown = ocr_mk_markdown_with_para_core_v2(
             paras_of_layout, 'mm', img_buket_path)