
refactor(libs): remove unused imports and functions

- Remove unused imports from commons.py
- Delete unused functions related to AWS and S3 operations
- Update import statements in other modules to reflect changes in commons.py (see the import sketch below)
- Remove redundant code and improve code readability
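
The net effect on imports across the touched modules, as a minimal sketch (the pattern below mirrors the pdf_meta_scan.py hunk; the exact helper list per module varies):

    # before: PyMuPDF and assorted helpers were pulled in through the commons module
    # from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file

    # after: import PyMuPDF directly; commons keeps only small, dependency-free helpers
    import fitz  # PyMuPDF
    from magic_pdf.libs.commons import get_top_percent_list, mymax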
myhloli 11 months ago
Parent
Commit
2db3c26374

+ 3 - 17
magic_pdf/filter/pdf_meta_scan.py

@@ -1,13 +1,12 @@
 """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
 
-import sys
 from collections import Counter
 
-import click
+import fitz
 from loguru import logger
 
 from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
+from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.pdf_check import detect_invalid_chars
 
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         return res
 
 
-@click.command()
-@click.option('--s3-pdf-path', help='path to the pdf file on s3')
-@click.option('--s3-profile', help='s3 profile to use')
-def main(s3_pdf_path: str, s3_profile: str):
-    """"""
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
-        logger.exception(e)
-
-
 if __name__ == '__main__':
-    main()
+    pass
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"

+ 0 - 161
magic_pdf/libs/commons.py

@@ -1,34 +1,8 @@
-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-
-import fitz # switched to the rebase build as of 1.23.9
-# import fitz_old as fitz  # use the pymupdf build from before 1.23.9
-
-
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
-
 
 def join_path(*args):
     return '/'.join(str(s).rstrip('/') for s in args)
 
 
-# Configure a global errlog_path here so demos can reference it consistently
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/" # this path is for temporary local testing only and must not be committed to main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # a base library should not hard-code paths like these; they belong in business code
-
-
 def get_top_percent_list(num_list, percent):
     """
     Get the top given percent of elements from a list
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
     return top_percent_list
 
 
-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
-
-
 def mymax(alist: list):
     if len(alist) == 0:
         return 0  # empty is 0, and 0*0 is also size 0
     else:
         return max(alist)
 
-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # parse the AWS config files
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # read the AWS account credentials
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"endpoint_url not found in the aws config file")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-
-    return ak, sk, endpoint, addressing_style
-
 
 def parse_bucket_key(s3_full_path: str):
     """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
         s3_full_path = s3_full_path[1:]
     bucket, key = s3_full_path.split("/", 1)
     return bucket, key
-
-
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-
-
-def get_docx_model_output(pdf_model_output, page_id):
-
-    model_output_json = pdf_model_output[page_id]
-
-    return model_output_json
-
-
-def list_dir(dir_path:str, s3_profile:str):
-    """
-    List all of the files under dir_path
-    """
-    ret = []
-    
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                                            config=Config(s3={'addressing_style': addressing_style}))
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents)==0:
-                        break
-                    marker = contents[-1]['Key']
-
-
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                #size = info['Size']
-
-                if path!="":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-                        
-            return ret
-
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else:  # local directory: walk it and return all of the json files inside
-        
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-
-def get_img_s3_client(save_path:str, image_s3_config:str):
-    """
-    """
-    if save_path.startswith("s3://"):  # created here so that an s3 client is only built when actually needed
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-        
-    return img_s3_client
-
-if __name__=="__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)
-    
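
With read_file, parse_aws_param, list_dir and get_img_s3_client deleted, commons keeps only small path/list helpers; callers that still need a raw S3 read can build a boto3 client themselves. A rough stand-in for the S3 branch of the removed read_file, with credentials and endpoint passed in explicitly (all argument values are placeholders):

    import boto3
    from magic_pdf.libs.commons import parse_bucket_key  # still exported

    def read_s3_file(s3_path: str, ak: str, sk: str, endpoint_url: str) -> bytes:
        # roughly what the deleted commons.read_file did for s3:// paths
        bucket, key = parse_bucket_key(s3_path)
        cli = boto3.client("s3", aws_access_key_id=ak,
                           aws_secret_access_key=sk, endpoint_url=endpoint_url)
        return cli.get_object(Bucket=bucket, Key=key)["Body"].read()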

+ 2 - 3
magic_pdf/libs/draw_bbox.py

@@ -1,8 +1,7 @@
+import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
-                                               ContentType)
+from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
 from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.model.magic_model import MagicModel
 
 

+ 2 - 1
magic_pdf/libs/pdf_image_tools.py

@@ -1,9 +1,10 @@
 from io import BytesIO
 import cv2
+import fitz
 import numpy as np
 from PIL import Image
 from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import fitz, join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256
 
 

+ 0 - 30
magic_pdf/model/magic_model.py

@@ -1,16 +1,12 @@
 import enum
-import json
 
 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                     bbox_relative_pos, box_area, calculate_iou,
                                     calculate_overlap_area_in_bbox1_area_ratio,
                                     get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
     def get_model_list(self, page_no):
         return self.__model_list[page_no]
 
-
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))

+ 2 - 2
magic_pdf/pdf_parse_union_core_v2.py

@@ -5,6 +5,7 @@ import time
 from typing import List
 
 import torch
+import fitz
 from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -12,7 +13,6 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
@@ -784,7 +784,7 @@ def pdf_parse_union(
         if debug_mode:
             time_now = time.time()
             logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
+                f'page_id: {page_id}, last_page_cost_time: {time.time() - start_time}'
             )
             start_time = time_now
 
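
Note that the removed get_delta_time also rounded the delta to two decimals; if the old log formatting is wanted, the rounding can be inlined, e.g.:

    f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'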

+ 1 - 1
magic_pdf/rw/S3ReaderWriter.py

@@ -1,5 +1,5 @@
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
+from magic_pdf.libs.commons import parse_bucket_key, join_path
 import boto3
 from loguru import logger
 from botocore.config import Config

+ 1 - 1
tests/unittest/test_metascan_classify/test_commons.py.bak

@@ -2,10 +2,10 @@ import io
 import json
 import os
 
+import fitz
 import boto3
 from botocore.config import Config
 
-from magic_pdf.libs.commons import fitz
 from magic_pdf.libs.config_reader import get_s3_config_dict
 
 from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key