Ver Fonte

添加debug模式

kernel.h@qq.com há 1 ano atrás
pai
commit
1f45e0ab4b

+ 1 - 2
magic_pdf/cli/magicpdf.py

@@ -35,7 +35,6 @@ from magic_pdf.libs.path_utils import (
 from magic_pdf.libs.config_reader import get_local_dir
 from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter, MODE_BIN, MODE_TXT
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.libs.json_compressor import JsonCompressor
 
 
 parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
@@ -54,7 +53,7 @@ def prepare_env():
 
 
 def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
-    uni_pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir)
+    uni_pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
     jso_useful_key = {
         "_pdf_type": "txt",
         "model_list": model_list,

+ 13 - 2
magic_pdf/model/magic_model.py

@@ -17,6 +17,11 @@ class MagicModel():
     def get_imgs(self, page_no:int): # @许瑞
         
         return_lst = []
+        
+        image_block = {
+            
+        }
+        
         img = {
         "bbox":[x0,y0,x1,y1]
         }
@@ -24,10 +29,16 @@ class MagicModel():
         "bbox":[x0,y0,x1,y1],
         "text":"",
         }
-        return [{"img":img, "caption":img_caption},]
+        
+        image_block['bbox'] = [x0, y0, x1, y1]# 计算出来
+        image_block['img_body'] = img
+        image_block['img_caption'] = img_caption
+        
+        
+        return [image_block,]
         
     def get_tables(self, page_no:int) ->list: # 3个坐标, caption, table主体,table-note
-        pass # 许瑞
+        pass # 许瑞, 结构和image一样
         
     def get_equations(self, page_no:int)->list: # 有坐标,也有字
         return inline_equations, interline_equations  # @凯文

+ 2 - 2
magic_pdf/para/para_split.py

@@ -549,7 +549,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
                 if "Table" in first_line_text or "Figure" in first_line_text:
                     pass
                 if debug_mode:
-                    logger.info(line_hi.std())
+                    logger.debug(line_hi.std())
                 
                 if line_hi.std()<2:
                     """行高度相同,那么判断是否居中"""
@@ -562,7 +562,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
                         merge_para = [l[0] for l in layout_para[start:end+1]]
                         para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
                         if debug_mode:
-                            logger.info(para_text)
+                            logger.debug(para_text)
                         layout_para[start:end+1] = [merge_para]
                         index_offset -= end-start
                         

+ 2 - 1
magic_pdf/pipe/AbsPipe.py

@@ -16,12 +16,13 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, ):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
         self.img_parent_path = img_parent_path
         self.pdf_mid_data = None # 未压缩
+        self.is_debug = is_debug
     
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)

+ 3 - 3
magic_pdf/pipe/OCRPipe.py

@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
 
     def pipe_classify(self):
         pass
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
+        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self):
         content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)

+ 3 - 3
magic_pdf/pipe/TXTPipe.py

@@ -6,14 +6,14 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
 
     def pipe_classify(self):
         pass
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)
+        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self):
         content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)

+ 4 - 4
magic_pdf/pipe/UNIPipe.py

@@ -15,18 +15,18 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug: bool = False):
         self.pdf_type = self.PIP_OCR
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
 
     def pipe_classify(self):
         self.pdf_type = UNIPipe.classify(self.pdf_bytes)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
-            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer)
+            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
         elif self.pdf_type == self.PIP_OCR:
-            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
+            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self):
         content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)