Преглед на файлове

feat(pdf_parse): filter out skewed text lines

- Add direction filtering to ignore highly skewed text lines
- Improve text extraction accuracy by focusing on non-skewed content
myhloli преди 11 месеца
родител
ревизия
37da8c44c4
променен е 1 файл, в който са добавени 3 реда и е изтрит 1 ред
  1. 3 1
      magic_pdf/pdf_parse_union_core_v2.py

+ 3 - 1
magic_pdf/pdf_parse_union_core_v2.py

@@ -139,10 +139,12 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
 
     text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
 
-    # @todo: 拿到char之后把倾斜角度较大的先删一遍
     all_pymu_chars = []
     for block in text_blocks_raw:
         for line in block['lines']:
+            cosine, sine = line['dir']
+            if abs (cosine) < 0.9 or abs(sine) > 0.1:
+                continue
             for span in line['spans']:
                 all_pymu_chars.extend(span['chars'])