Przeglądaj źródła

Merge pull request #743 from myhloli/para-split-v3

refactor(para_split_v3): merge list and index block detection
Xiaomeng Zhao 1 rok temu
rodzic
commit
0d83fb77dd

+ 2 - 0
magic_pdf/libs/draw_bbox.py

@@ -237,6 +237,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
                 BlockType.Text,
                 BlockType.Title,
                 BlockType.InterlineEquation,
+                BlockType.List,
+                BlockType.Index,
             ]:
                 for line in block['lines']:
                     for span in line['spans']:

Plik diff jest za duży
+ 107 - 80
magic_pdf/para/para_split_v3.py


+ 1 - 1
magic_pdf/pdf_parse_union_core_v2.py

@@ -360,7 +360,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
                                                need_drop, drop_reason)
 
     '''将span填入blocks中'''
-    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.3)
+    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
 
     '''对block进行fix操作'''
     fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików