Selaa lähdekoodia

Merge pull request #1164 from myhloli/dev

refactor(para): adjust line height multiplier for block splitting, fix(pre_proc): prevent errors when imageWriter is None
Xiaomeng Zhao 11 kuukautta sitten
vanhempi
commit
ed822634df
3 muutettua tiedostoa jossa 6 lisäystä ja 6 poistoa
  1. 2 2
      magic_pdf/data/utils.py
  2. 2 2
      magic_pdf/para/para_split_v3.py
  3. 2 2
      magic_pdf/pre_proc/cut_image.py

+ 2 - 2
magic_pdf/data/utils.py

@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
     mat = fitz.Matrix(dpi / 72, dpi / 72)
     pm = doc.get_pixmap(matrix=mat, alpha=False)
 
-    # If the width or height exceeds 9000 after scaling, do not scale further.
-    if pm.width > 9000 or pm.height > 9000:
+    # If the width or height exceeds 4500 after scaling, do not scale further.
+    if pm.width > 4500 or pm.height > 4500:
         pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
 
     img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)

+ 2 - 2
magic_pdf/para/para_split_v3.py

@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
             line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
             block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
             if (
-                line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
-                and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
+                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
             ):
                 external_sides_not_close_num += 1
             if abs(line_mid_x - block_mid_x) < line_height / 2:

+ 2 - 2
magic_pdf/pre_proc/cut_image.py

@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
     for span in spans:
         span_type = span['type']
         if span_type == ContentType.Image:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                 continue
             span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
                                            imageWriter=imageWriter)
         elif span_type == ContentType.Table:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                 continue
             span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                            imageWriter=imageWriter)