Эх сурвалжийг харах

Update AVG_TEXT_LEN_THRESHOLD from 200 to 100

赵小蒙 1 жил өмнө
parent
commit
084dc22ab1

+ 25 - 17
magic_pdf/filter/pdf_classify_by_type.py

@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
 from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
 
 TEXT_LEN_THRESHOLD = 100
-AVG_TEXT_LEN_THRESHOLD = 200
+AVG_TEXT_LEN_THRESHOLD = 100
 TEXT_LEN_SAMPLE_RATIO = 0.1  # 抽取0.1的页面进行文字长度统计
 
 
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
             # 如果宽达标,检测是否能竖着拼
             if full_width:
                 # 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
-                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
+                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
+                            last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
 
             # 如果高达标,检测是否可以横着拼
             if full_height:
                 # 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
-                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
+                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
+                            last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
 
             # Check if the image can be merged with the last image
             if (full_width and close1) or (full_height and close2):
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
     # 先对每个id出现的次数做个统计
     objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
     # 再去掉出现次数大于10的
-    if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
+    if total_page >= scan_max_page:  # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
         total_page = scan_max_page
 
-
     repeat_threshold = 2  # 把bad_image的阈值设为2
     # repeat_threshold = min(2, total_page)  # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
     bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
     # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # 这些透明图片所在的页面上有文字大于阈值
     #     return True
 
-    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list]  # 过滤掉重复出现的图片
-
+    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
+                   img_sz_list]  # 过滤掉重复出现的图片
 
     # 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
     img_sz_list = merge_images(img_sz_list, page_width, page_height)
 
     # 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
+    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
+                               img_sz_list]
     page_area = page_width * page_height
     max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
 
-    if len(max_image_area_per_page) >= 0.5 * total_page:   # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
+    if len(max_image_area_per_page) >= 0.5 * total_page:  # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
         # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
         return False
     else:
         return True
 
 
-
 def classify_by_text_len(text_len_list: list, total_page: int):
     """
     随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
     is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
     return is_text_pdf
 
+
 def classify_by_avg_words(text_len_list: list):
     """
     补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
 
     return is_text_pdf
 
+
 def classify_by_img_num(img_sz_list: list, img_num_list: list):
     """
     补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
     # img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
     if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
 
-    #拿max和min的值,用来判断list内的值是否全都相等
-    # min_imgs = min(img_num_list)
-    # max_imgs = max(img_num_list)
-    #
-    # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
+        #拿max和min的值,用来判断list内的值是否全都相等
+        # min_imgs = min(img_num_list)
+        # max_imgs = max(img_num_list)
+        #
+        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
         return False  # 如果满足这个条件,一定不是文字版pdf
     else:
         return True  # 不满足这三个条件,可能是文字版pdf,通过其他规则判断
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
     else:
         return False  # 文本布局未知,默认认为不是文字版pdf
 
+
 def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     """
     判断一页是否由细长条组成,有两个条件:
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     Returns:
         bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
     """
+
     def is_narrow_strip(img):
         x0, y0, x1, y1, _ = img
         width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     return narrow_strip_pages_ratio < 0.5
 
 
-def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
+def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
+             text_layout_list: list):
     """
     这里的图片和页面长度单位是pts
     :param total_page:
@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
     elif not any(results.values()):
         return False, results
     else:
-        logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
+        logger.warning(
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
         return False, results