Răsfoiți Sursa

add para_to_standard_format logic

赵小蒙 1 an în urmă
părinte
comite
d3542f6a71
3 a modificat fișierele cu 55 adăugiri și 12 ștergeri
  1. 7 7
      magic_pdf/cli/magicpdf.py
  2. 46 4
      magic_pdf/dict2md/ocr_mkcontent.py
  3. 2 1
      magic_pdf/pipe/AbsPipe.py

+ 7 - 7
magic_pdf/cli/magicpdf.py

@@ -84,13 +84,13 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
         path=f"{pdf_file_name}.json",
         mode=AbsReaderWriter.MODE_TXT,
     )
-    # try:
-    #     content_list = pipe.pipe_mk_uni_format()
-    # except Exception as e:
-    #     logger.exception(e)
-    # md_writer.write(
-    #     str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
-    # )
+    try:
+        content_list = pipe.pipe_mk_uni_format()
+    except Exception as e:
+        logger.exception(e)
+    md_writer.write(
+        str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
+    )
 
 
 @click.group()

+ 46 - 4
magic_pdf/dict2md/ocr_mkcontent.py

@@ -201,16 +201,58 @@ def para_to_standard_format(para, img_buket_path):
     return para_content
 
 
+def para_to_standard_format_v2(para_block, img_buket_path):
+    para_type = para_block['type']
+    if para_type == BlockType.Text:
+        para_content = {
+            'type': 'text',
+            'text': merge_para_with_text(para_block),
+        }
+    elif para_type == BlockType.Title:
+        para_content = {
+            'type': 'text',
+            'text': merge_para_with_text(para_block),
+            'text_level': 1
+        }
+    elif para_type == BlockType.InterlineEquation:
+        para_content = {
+            'type': 'equation',
+            'text': merge_para_with_text(para_block),
+            'text_format': "latex"
+        }
+    elif para_type == BlockType.Image:
+        para_content = {
+            'type': 'image',
+        }
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.ImageBody:
+                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+            if block['type'] == BlockType.ImageCaption:
+                para_content['img_caption'] = merge_para_with_text(block)
+    elif para_type == BlockType.Table:
+        para_content = {
+            'type': 'table',
+        }
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.TableBody:
+                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+            if block['type'] == BlockType.TableCaption:
+                para_content['table_caption'] = merge_para_with_text(block)
+            if block['type'] == BlockType.TableFootnote:
+                para_content['table_footnote'] = merge_para_with_text(block)
+
+    return para_content
+
+
 def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
     content_list = []
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get("para_blocks")
         if not paras_of_layout:
             continue
-        for paras in paras_of_layout:
-            for para in paras:
-                para_content = para_to_standard_format(para, img_buket_path)
-                content_list.append(para_content)
+        for para_block in paras_of_layout:
+            para_content = para_to_standard_format_v2(para_block, img_buket_path)
+            content_list.append(para_content)
     return content_list
 
 

+ 2 - 1
magic_pdf/pipe/AbsPipe.py

@@ -92,7 +92,8 @@ class AbsPipe(ABC):
         parse_type = pdf_mid_data["_parse_type"]
         pdf_info_list = pdf_mid_data["pdf_info"]
         if parse_type == AbsPipe.PIP_TXT:
-            content_list = mk_universal_format(pdf_info_list, img_buket_path)
+            # content_list = mk_universal_format(pdf_info_list, img_buket_path)
+            content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
         elif parse_type == AbsPipe.PIP_OCR:
             content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
         return content_list