
update: Enhance the capability to detect garbled documents

赵小蒙 1 year ago
parent
commit
df14c61f6f

+ 7 - 3
magic_pdf/filter/pdf_classify_by_type.py

@@ -305,7 +305,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list):
+             text_layout_list: list, invalid_chars: bool):
     """
     Image and page dimensions here are in pts.
     :param total_page:
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
         'by_text_layout': classify_by_text_layout(text_layout_list),
-        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
+        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
+        'by_invalid_chars': invalid_chars,
     }
 
     if all(results.values()):
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
+            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # this case helps quickly spot unusual pdfs and tune the classification algorithm for them
         return False, results
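
For context, classify now treats the garbled-text signal as one more boolean vote: a PDF counts as a text PDF only when every check passes, including the new by_invalid_chars. A reduced sketch of that voting logic, with made-up check results:

    # Reduced sketch of the aggregation in classify(); the values are invented.
    results = {
        'by_image_area': True,
        'by_text_len': True,
        'by_avg_words': True,
        'by_img_num': True,
        'by_text_layout': True,
        'by_img_narrow_strips': True,
        'by_invalid_chars': False,  # garbled text detected: vetoes the text classification
    }

    if all(results.values()):
        is_text_pdf = True    # every check passed: parse as a text PDF
    elif not any(results.values()):
        is_text_pdf = False   # every check failed: clearly a scanned/image PDF
    else:
        is_text_pdf = False   # mixed signals: logged as a special case, treated as non-text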
 

+ 34 - 14
magic_pdf/filter/pdf_meta_scan.py

@@ -12,12 +12,13 @@ from collections import Counter
 
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.pdf_check import detect_invalid_chars
 
 scan_max_page = 50
 junk_limit_min = 10
 
 
-def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
     max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                                result]
     page_area = int(page_width_pts) * int(page_height_pts)
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
     return max_image_area_per_page
 
+
 def process_image(page, junk_img_bojids=[]):
-    page_result = []# stores the 4-tuples of every image on each page
+    page_result = []  # stores the 4-tuples of every image on each page
     items = page.get_images()
     dedup = set()
     for img in items:
        # this returns the image's actual displayed size on the page. Returns an array; the first part of each element is
-        img_bojid = img[0]# globally unique within the pdf file; an image that recurs throughout the pdf is likely junk, e.g. a watermark or header/footer
-        if img_bojid in junk_img_bojids:# skip junk images
+        img_bojid = img[0]  # globally unique within the pdf file; an image that recurs throughout the pdf is likely junk, e.g. a watermark or header/footer
+        if img_bojid in junk_img_bojids:  # skip junk images
             continue
         recs = page.get_image_rects(img, transform=True)
         if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
             dedup.add((x0, y0, x1, y1, img_bojid))
             page_result.append([x0, y0, x1, y1, img_bojid])
     return page_result
+
+
 def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     """
     Returns the image 4-tuples for every page; a page may contain multiple images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
     # find the img_bojids that occur more than half of len(doc) times
 
-    junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with very few pages
+    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with very few pages
 
     junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
 
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
         result.append(page_result)
         for item in result:
             if not any(item):  # if any page has no images, this is a text-based pdf; check whether it is a special text-based one
-                if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# a special text-based pdf: clear the junk list and break
+                if max(imgs_len_list) == min(imgs_len_list) and max(
+                        imgs_len_list) >= junk_limit_min:  # a special text-based pdf: clear the junk list and break
                     junk_img_bojids = []
-                else:# not a special text-based pdf but a normal one that contains junk images; do not clear the junk list
+                else:  # not a special text-based pdf but a normal one that contains junk images; do not clear the junk list
                     pass
                 break_loop = True
                 break
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
         # check whether the first 80% of elements are all equal
         if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
 
-        # # if the first 10 pages all have images, use whether the per-page image counts are equal to decide whether to clear the junk list
-        # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
+            # # if the first 10 pages all have images, use whether the per-page image counts are equal to decide whether to clear the junk list
+            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
 
             # the first 10 pages all have images with equal counts; check how much page area the images cover to decide whether to clear the junk list
             max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
             if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are large images, so this may be a text-based pdf; clear the junk image list
                 junk_img_bojids = []
-            else:# the first 10 pages all have images, 80% of them large, with equal and high per-page counts: a scanned pdf (type 1), so keep the junk list
+            else:  # the first 10 pages all have images, 80% of them large, with equal and high per-page counts: a scanned pdf (type 1), so keep the junk list
                 pass
-        else:# per-page image counts differ; clear the junk list and scan all images across the first 50 pages
+        else:  # per-page image counts differ; clear the junk list and scan all images across the first 50 pages
             junk_img_bojids = []
 
     # now formally enter the flow that collects image info from the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
     median_width = page_width_list[len(page_width_list) // 2]
     median_height = page_height_list[len(page_height_list) // 2]
 
-
     return median_width, median_height
 
 
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
 
     return text_len_lst
 
+
 def get_pdf_text_layout_per_page(doc: fitz.Document):
     """
     For each page of the PDF, determine whether its text layout is horizontal, vertical, or unknown.
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
         # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
     return text_layout_list
 
+
 '''custom exception raised for pdfs where a single page contains too many SVGs'''
+
+
 class PageSvgsTooManyError(Exception):
     def __init__(self, message="Page SVGs are too many"):
         self.message = message
         super().__init__(self.message)
+
+
 def get_svgs_per_page(doc: fitz.Document):
     svgs_len_list = []
     for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
         # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
     return svgs_len_list
 
+
 def get_imgs_per_page(doc: fitz.Document):
     imgs_len_list = []
     for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
     return language
 
 
+def check_invalid_chars(pdf_bytes):
+    """
+    Garbled-character detection.
+    """
+    return detect_invalid_chars(pdf_bytes)
+
+
 def pdf_meta_scan(pdf_bytes: bytes):
     """
     :param pdf_bytes:
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         # logger.info(f"text_layout_per_page: {text_layout_per_page}")
         text_language = get_language(doc)
         # logger.info(f"text_language: {text_language}")
-
+        invalid_chars = check_invalid_chars(pdf_bytes)
+        # logger.info(f"invalid_chars: {invalid_chars}")
 
         # finally emit one json record
         res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
             # "svgs_per_page": svgs_per_page,
             "imgs_per_page": imgs_per_page,  # add the list of per-page image counts
             "junk_img_bojids": junk_img_bojids,  # add the list of junk image bojids
+            "invalid_chars": invalid_chars,
             "metadata": doc.metadata
         }
         # logger.info(json.dumps(res, ensure_ascii=False))
@@ -365,4 +385,4 @@ if __name__ == '__main__':
     # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
     # doc = fitz.open("pdf", file_content)
     # text_layout_lst = get_pdf_text_layout_per_page(doc)
-    # print(text_layout_lst)
+    # print(text_layout_lst)
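
pdf_meta_scan now runs the garbled-text check once per document and stores the verdict in its output, so downstream stages read a precomputed boolean instead of re-extracting text. A hedged usage sketch ("doc.pdf" is a hypothetical path, not part of this commit):

    # Sketch: the meta-scan result now carries "invalid_chars",
    # which AbsPipe later forwards into classify().
    from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan

    with open("doc.pdf", "rb") as f:
        pdf_bytes = f.read()

    meta = pdf_meta_scan(pdf_bytes)
    print(meta["invalid_chars"])  # True: text extracts cleanly; False: garbled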

+ 59 - 0
magic_pdf/libs/pdf_check.py

@@ -0,0 +1,59 @@
+from io import BytesIO
+import re
+import fitz
+import numpy as np
+from loguru import logger
+from pdfminer.high_level import extract_text
+
+
+def calculate_sample_count(total_page: int, sample_ratio=0.1):
+    """
+    Compute how many pages to sample from the total page count and the sampling ratio.
+    """
+    select_page_cnt = int(total_page * sample_ratio)
+    if select_page_cnt < 5:
+        select_page_cnt = min(10, total_page)
+    elif select_page_cnt > 10:
+        select_page_cnt = 10
+    return select_page_cnt
+
+
+def extract_pages(src_pdf_bytes: bytes):
+    pdf_docs = fitz.open("pdf", src_pdf_bytes)
+    total_page = len(pdf_docs)
+    if total_page == 0:
+        # if the PDF has no pages, return an empty document immediately
+        logger.warning("PDF is empty, return empty document")
+        return fitz.Document()
+    select_page_cnt = calculate_sample_count(total_page)
+
+    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
+    sample_docs = fitz.Document()
+    try:
+        for index in page_num:
+            sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
+    except Exception as e:
+        logger.exception(e)
+    return sample_docs
+
+
+def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+    """
+    Detect whether the PDF contains invalid (garbled) characters.
+    """
+
+    sample_docs = extract_pages(src_pdf_bytes)
+    sample_pdf_bytes = sample_docs.tobytes()
+    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+    text = extract_text(sample_pdf_file_like_object)
+    # logger.info(text)
+    '''text that pdfminer extracts from a garbled pdf carries the telltale pattern (cid:xxx)'''
+    cid_pattern = re.compile(r'\(cid:\d+\)')
+    matches = cid_pattern.findall(text)
+    cid_count = len(matches)
+    text_len = len(text)
+    logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
+    if cid_count > 10:
+        return False  # garbled document
+    else:
+        return True   # normal document
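
The new module works in two stages: extract_pages draws a random sample of pages (10% of the total, capped at 10, and raised to min(10, total) whenever the 10% figure falls below 5, so a 200-page PDF samples 10 pages while a 3-page PDF samples all 3), and detect_invalid_chars re-serializes that sample and runs pdfminer text extraction over it; more than 10 "(cid:xxx)" tokens marks the document as garbled. A minimal sketch of calling it directly ("sample.pdf" is a hypothetical path):

    from magic_pdf.libs.pdf_check import detect_invalid_chars

    with open("sample.pdf", "rb") as f:
        pdf_bytes = f.read()

    # False: more than 10 "(cid:xxx)" tokens were found (garbled document);
    # True: the sampled text extracted cleanly (normal document).
    print(detect_invalid_chars(pdf_bytes))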

+ 1 - 0
magic_pdf/pipe/AbsPipe.py

@@ -83,6 +83,7 @@ class AbsPipe(ABC):
                     pdf_meta["text_len_per_page"],
                     pdf_meta["imgs_per_page"],
                     pdf_meta["text_layout_per_page"],
+                    pdf_meta["invalid_chars"],
                 )
                 if is_text_pdf:
                     return AbsPipe.PIP_TXT
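
With the field threaded through here, a garbled PDF now fails classify() and never reaches the text pipeline. A simplified sketch of the routing (the OCR branch is inferred from context, not shown in this hunk):

    # Simplified: classify() returns (is_text_pdf, results); a False
    # by_invalid_chars makes is_text_pdf False, steering the PDF to OCR.
    if is_text_pdf:
        pipe_type = AbsPipe.PIP_TXT  # extract the embedded text layer
    else:
        pipe_type = AbsPipe.PIP_OCR  # rasterize pages and run OCR instead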

+ 37 - 36
magic_pdf/user_api.py

@@ -86,45 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
             return None
 
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-    text_all = ""
-    for page_dict in pdf_info_dict['pdf_info']:
-        for para_block in page_dict['para_blocks']:
-            if para_block['type'] in ['title', 'text']:
-                for line in para_block['lines']:
-                    for span in line['spans']:
-                        text_all += span['content']
-
-    def calculate_not_common_character_rate(text):
-        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
-        # count the garbled characters
-        garbage_count = len(garbage_regex.findall(text))
-        total = len(text)
-        if total == 0:
-            return 0  # avoid division by zero
-        return garbage_count / total
-
-    def calculate_not_printable_rate(text):
-        printable_text = ""
-        for c in text:
-            if c.isprintable():
-                printable_text += c
-        printable_total = len(printable_text)
-        total = len(text)
-        if total == 0:
-            return 0  # avoid division by zero
-        return (total - printable_total) / total
-
-    not_common_character_rate = calculate_not_common_character_rate(text_all)
-    not_printable_rate = calculate_not_printable_rate(text_all)
-    pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
-    pdf_info_dict["_not_printable_rate"] = not_printable_rate
-    logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    # text_all = ""
+    # for page_dict in pdf_info_dict['pdf_info']:
+    #     for para_block in page_dict['para_blocks']:
+    #         if para_block['type'] in ['title', 'text']:
+    #             for line in para_block['lines']:
+    #                 for span in line['spans']:
+    #                     text_all += span['content']
+
+    # def calculate_not_common_character_rate(text):
+    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+    #     # count the garbled characters
+    #     garbage_count = len(garbage_regex.findall(text))
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # avoid division by zero
+    #     return garbage_count / total
+    #
+    # def calculate_not_printable_rate(text):
+    #     printable_text = ""
+    #     for c in text:
+    #         if c.isprintable():
+    #             printable_text += c
+    #     printable_total = len(printable_text)
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # avoid division by zero
+    #     return (total - printable_total) / total
+    #
+    # not_common_character_rate = calculate_not_common_character_rate(text_all)
+    # not_printable_rate = calculate_not_printable_rate(text_all)
+    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
+    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
+    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    '''the new logic uses pdfminer to identify garbled pdfs: it is accurate, avoids false positives, and already runs before the parsing flow'''
     # not_common_character_rate may produce false positives for low-resource languages; not_printable_rate is friendlier to them
     if (pdf_info_dict is None
-        or pdf_info_dict.get("_need_drop", False)
-        or not_printable_rate > 0.02  # in sampled normal pdfs this value never exceeded 0.01, so the threshold is set to 0.02
+            or pdf_info_dict.get("_need_drop", False)
+            # or not_printable_rate > 0.02  # in sampled normal pdfs this value never exceeded 0.01, so the threshold is set to 0.02
     ):
-        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
             pdf_models = doc_analyze(pdf_bytes, ocr=True)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
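
Net effect in parse_union_pdf: the per-parse garbled-rate heuristics are retired, since the pdfminer check already vetoes garbled PDFs before parsing starts, and the txt-to-ocr fallback now fires only on a drop or a parse error. A condensed view of the surviving logic (names as in the diff above):

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
        logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)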

+ 2 - 1
requirements.txt

@@ -14,4 +14,5 @@ wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+pdfminer.six>=20231228