Browse Source

feat(tools): add character bounding box drawing functionality

- Add `draw_char_bbox` function to `draw_bbox.py` for drawing character bounding boxes
- Integrate `draw_char_bbox` into `common.py` for use in PDF processing pipeline
- Add an `f_draw_char_bbox` option to `do_parse` for drawing character bounding boxes (off by default; not yet enabled automatically in debug mode)
myhloli 10 months ago
parent
commit
f911a102ab
2 changed files with 14 additions and 12 deletions
  1. +8 -12
      magic_pdf/libs/draw_bbox.py
  2. +6 -0
      magic_pdf/tools/common.py

+ 8 - 12
magic_pdf/libs/draw_bbox.py

@@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
     pdf_docs.save(f'{out_path}/{filename}')
     pdf_docs.save(f'{out_path}/{filename}')
 
 
 
 
-def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
-    layout_bbox_list = []
-
-    for page in pdf_info:
-        page_block_list = []
-        for block in page['para_blocks']:
-            bbox = block['bbox']
-            page_block_list.append(bbox)
-        layout_bbox_list.append(page_block_list)
+def draw_char_bbox(pdf_bytes, out_path, filename):
     pdf_docs = fitz.open('pdf', pdf_bytes)
     pdf_docs = fitz.open('pdf', pdf_bytes)
     for i, page in enumerate(pdf_docs):
     for i, page in enumerate(pdf_docs):
-        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
-
-    pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
+        for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
+            for line in block['lines']:
+                for span in line['spans']:
+                    for char in span['chars']:
+                        char_bbox = char['bbox']
+                        page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,)
+    pdf_docs.save(f'{out_path}/{filename}')

+ 6 - 0
magic_pdf/tools/common.py

@@ -9,6 +9,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.libs.draw_bbox import draw_char_bbox
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.operators.models import InferenceResult
 from magic_pdf.operators.models import InferenceResult
 
 
@@ -83,6 +84,7 @@ def do_parse(
     f_make_md_mode=MakeMode.MM_MD,
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
     f_draw_model_bbox=False,
     f_draw_line_sort_bbox=False,
     f_draw_line_sort_bbox=False,
+    f_draw_char_bbox=False,
     start_page_id=0,
     start_page_id=0,
     end_page_id=None,
     end_page_id=None,
     lang=None,
     lang=None,
@@ -94,6 +96,7 @@ def do_parse(
         logger.warning('debug mode is on')
         logger.warning('debug mode is on')
         f_draw_model_bbox = True
         f_draw_model_bbox = True
         f_draw_line_sort_bbox = True
         f_draw_line_sort_bbox = True
+        # f_draw_char_bbox = True
 
 
     pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
     pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
         pdf_bytes, start_page_id, end_page_id
         pdf_bytes, start_page_id, end_page_id
@@ -205,6 +208,9 @@ def do_parse(
             os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
             os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
         )
         )
 
 
+    if f_draw_char_bbox:
+        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
+
     if f_dump_md:
     if f_dump_md:
         pipe_result.dump_md(
         pipe_result.dump_md(
             md_writer,
             md_writer,