|
@@ -12,12 +12,13 @@ from collections import Counter
|
|
|
|
|
|
|
|
from magic_pdf.libs.drop_reason import DropReason
|
|
from magic_pdf.libs.drop_reason import DropReason
|
|
|
from magic_pdf.libs.language import detect_lang
|
|
from magic_pdf.libs.language import detect_lang
|
|
|
|
|
+from magic_pdf.libs.pdf_check import detect_invalid_chars
|
|
|
|
|
|
|
|
scan_max_page = 50
|
|
scan_max_page = 50
|
|
|
junk_limit_min = 10
|
|
junk_limit_min = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
-def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
|
|
|
|
|
|
|
+def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
|
|
|
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
|
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
|
|
result]
|
|
result]
|
|
|
page_area = int(page_width_pts) * int(page_height_pts)
|
|
page_area = int(page_width_pts) * int(page_height_pts)
|
|
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
|
|
|
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
|
|
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
|
|
|
return max_image_area_per_page
|
|
return max_image_area_per_page
|
|
|
|
|
|
|
|
|
|
+
|
|
|
def process_image(page, junk_img_bojids=[]):
|
|
def process_image(page, junk_img_bojids=[]):
|
|
|
- page_result = []# 存每个页面里的多张图四元组信息
|
|
|
|
|
|
|
+ page_result = [] # 存每个页面里的多张图四元组信息
|
|
|
items = page.get_images()
|
|
items = page.get_images()
|
|
|
dedup = set()
|
|
dedup = set()
|
|
|
for img in items:
|
|
for img in items:
|
|
|
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
|
|
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
|
|
|
- img_bojid = img[0]# 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
|
|
|
|
- if img_bojid in junk_img_bojids:# 如果是垃圾图像,就跳过
|
|
|
|
|
|
|
+ img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
|
|
|
|
+ if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过
|
|
|
continue
|
|
continue
|
|
|
recs = page.get_image_rects(img, transform=True)
|
|
recs = page.get_image_rects(img, transform=True)
|
|
|
if recs:
|
|
if recs:
|
|
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
|
|
|
dedup.add((x0, y0, x1, y1, img_bojid))
|
|
dedup.add((x0, y0, x1, y1, img_bojid))
|
|
|
page_result.append([x0, y0, x1, y1, img_bojid])
|
|
page_result.append([x0, y0, x1, y1, img_bojid])
|
|
|
return page_result
|
|
return page_result
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
|
"""
|
|
"""
|
|
|
返回每个页面里的图片的四元组,每个页面多个图片。
|
|
返回每个页面里的图片的四元组,每个页面多个图片。
|
|
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
|
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
|
|
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
|
|
|
# 找出出现次数超过 len(doc) 半数的 img_bojid
|
|
# 找出出现次数超过 len(doc) 半数的 img_bojid
|
|
|
|
|
|
|
|
- junk_limit = max(len(doc)*0.5, junk_limit_min)# 对一些页数比较少的进行豁免
|
|
|
|
|
|
|
+ junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免
|
|
|
|
|
|
|
|
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
|
|
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
|
|
|
|
|
|
|
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
|
result.append(page_result)
|
|
result.append(page_result)
|
|
|
for item in result:
|
|
for item in result:
|
|
|
if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
|
|
if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
|
|
|
- if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# 如果是特殊文字版,就把junklist置空并break
|
|
|
|
|
|
|
+ if max(imgs_len_list) == min(imgs_len_list) and max(
|
|
|
|
|
+ imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break
|
|
|
junk_img_bojids = []
|
|
junk_img_bojids = []
|
|
|
- else:# 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
|
|
|
|
|
|
|
+ else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
|
|
|
pass
|
|
pass
|
|
|
break_loop = True
|
|
break_loop = True
|
|
|
break
|
|
break
|
|
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
|
# 检查前80%的元素是否都相等
|
|
# 检查前80%的元素是否都相等
|
|
|
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
|
|
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
|
|
|
|
|
|
|
|
- # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
|
|
|
|
|
- # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
|
|
|
|
|
|
|
+ # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
|
|
|
|
|
+ # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
|
|
|
|
|
|
|
|
#前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
|
|
#前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
|
|
|
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
|
|
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
|
|
|
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
|
|
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
|
|
|
junk_img_bojids = []
|
|
junk_img_bojids = []
|
|
|
- else:# 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
|
|
|
|
|
|
|
+ else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
|
|
|
pass
|
|
pass
|
|
|
- else:# 每页图片数量不一致,需要清掉junklist全量跑前50页图片
|
|
|
|
|
|
|
+ else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片
|
|
|
junk_img_bojids = []
|
|
junk_img_bojids = []
|
|
|
|
|
|
|
|
#正式进入取前50页图片的信息流程
|
|
#正式进入取前50页图片的信息流程
|
|
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
|
|
|
median_width = page_width_list[len(page_width_list) // 2]
|
|
median_width = page_width_list[len(page_width_list) // 2]
|
|
|
median_height = page_height_list[len(page_height_list) // 2]
|
|
median_height = page_height_list[len(page_height_list) // 2]
|
|
|
|
|
|
|
|
-
|
|
|
|
|
return median_width, median_height
|
|
return median_width, median_height
|
|
|
|
|
|
|
|
|
|
|
|
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
|
|
|
|
|
|
|
|
return text_len_lst
|
|
return text_len_lst
|
|
|
|
|
|
|
|
|
|
+
|
|
|
def get_pdf_text_layout_per_page(doc: fitz.Document):
|
|
def get_pdf_text_layout_per_page(doc: fitz.Document):
|
|
|
"""
|
|
"""
|
|
|
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
|
|
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
|
|
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
|
|
|
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
|
|
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
|
|
|
return text_layout_list
|
|
return text_layout_list
|
|
|
|
|
|
|
|
|
|
+
|
|
|
'''定义一个自定义异常用来抛出单页svg太多的pdf'''
|
|
'''定义一个自定义异常用来抛出单页svg太多的pdf'''
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
class PageSvgsTooManyError(Exception):
    """Custom exception signalling that a single PDF page has too many SVGs.

    The human-readable text is kept on ``self.message`` and is also passed to
    ``Exception`` so that ``str(exc)`` and ``exc.args`` carry the same value.

    :param message: description of the failure; defaults to
        ``"Page SVGs are too many"``.
    """

    def __init__(self, message="Page SVGs are too many"):
        # Store the message first so the attribute exists even if a subclass
        # overrides Exception initialisation behaviour.
        self.message = message
        super().__init__(self.message)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def get_svgs_per_page(doc: fitz.Document):
|
|
def get_svgs_per_page(doc: fitz.Document):
|
|
|
svgs_len_list = []
|
|
svgs_len_list = []
|
|
|
for page_id, page in enumerate(doc):
|
|
for page_id, page in enumerate(doc):
|
|
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
|
|
|
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
|
|
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
|
|
|
return svgs_len_list
|
|
return svgs_len_list
|
|
|
|
|
|
|
|
|
|
+
|
|
|
def get_imgs_per_page(doc: fitz.Document):
|
|
def get_imgs_per_page(doc: fitz.Document):
|
|
|
imgs_len_list = []
|
|
imgs_len_list = []
|
|
|
for page_id, page in enumerate(doc):
|
|
for page_id, page in enumerate(doc):
|
|
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
|
|
|
return language
|
|
return language
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_invalid_chars(pdf_bytes):
    """
    Garbled-text (invalid character) detection.

    Thin wrapper: forwards ``pdf_bytes`` unchanged to ``detect_invalid_chars``
    and returns that function's result as-is.

    :param pdf_bytes: PDF content to inspect; presumably raw ``bytes`` (the
        caller ``pdf_meta_scan`` annotates its argument that way) -- confirm
        against ``detect_invalid_chars``.
    :return: whatever ``detect_invalid_chars`` returns; its type and meaning
        are not visible from this function.
    """
    return detect_invalid_chars(pdf_bytes)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def pdf_meta_scan(pdf_bytes: bytes):
|
|
def pdf_meta_scan(pdf_bytes: bytes):
|
|
|
"""
|
|
"""
|
|
|
    :param pdf_bytes: content of the PDF file to scan, as bytes
|
|
    :param pdf_bytes: content of the PDF file to scan, as bytes
|
|
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
|
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
|
|
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
|
|
|
text_language = get_language(doc)
|
|
text_language = get_language(doc)
|
|
|
# logger.info(f"text_language: {text_language}")
|
|
# logger.info(f"text_language: {text_language}")
|
|
|
-
|
|
|
|
|
|
|
+ invalid_chars = check_invalid_chars(pdf_bytes)
|
|
|
|
|
+ # logger.info(f"invalid_chars: {invalid_chars}")
|
|
|
|
|
|
|
|
# 最后输出一条json
|
|
# 最后输出一条json
|
|
|
res = {
|
|
res = {
|
|
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
|
# "svgs_per_page": svgs_per_page,
|
|
# "svgs_per_page": svgs_per_page,
|
|
|
"imgs_per_page": imgs_per_page, # 增加每页img数量list
|
|
"imgs_per_page": imgs_per_page, # 增加每页img数量list
|
|
|
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
|
|
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
|
|
|
|
|
+ "invalid_chars": invalid_chars,
|
|
|
"metadata": doc.metadata
|
|
"metadata": doc.metadata
|
|
|
}
|
|
}
|
|
|
# logger.info(json.dumps(res, ensure_ascii=False))
|
|
# logger.info(json.dumps(res, ensure_ascii=False))
|
|
@@ -365,4 +385,4 @@ if __name__ == '__main__':
|
|
|
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
|
|
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
|
|
|
# doc = fitz.open("pdf", file_content)
|
|
# doc = fitz.open("pdf", file_content)
|
|
|
# text_layout_lst = get_pdf_text_layout_per_page(doc)
|
|
# text_layout_lst = get_pdf_text_layout_per_page(doc)
|
|
|
- # print(text_layout_lst)
|
|
|
|
|
|
|
+ # print(text_layout_lst)
|