zhengchun
/
MinerU


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
							import hashlib
import mimetypes
import urllib.parse


def is_pdf(filename, file):
    """
    判断文件是否为PDF格式，支持中文名和特殊字符。

    :param filename: 文件名
    :param file: 文件对象
    :return: 如果文件是PDF格式，则返回True，否则返回False
    """
    try:
        # 对文件名进行URL解码，处理特殊字符
        decoded_filename = urllib.parse.unquote(filename)
        
        # 检查MIME类型
        mime_type, _ = mimetypes.guess_type(decoded_filename)
        print(f"Detected MIME type: {mime_type}")
        
        # 某些情况下mime_type可能为None，需要特殊处理
        if mime_type is None:
            # 只检查文件内容的PDF标识
            file_start = file.read(5)
            file.seek(0)  # 重置文件指针
            return file_start.startswith(b'%PDF-')
            
        if mime_type != 'application/pdf':
            return False

        # 检查文件内容的PDF标识
        file_start = file.read(5)
        file.seek(0)  # 重置文件指针
        if not file_start.startswith(b'%PDF-'):
            return False

        return True
        
    except Exception as e:
        print(f"Error checking PDF format: {str(e)}")
        # 发生错误时，仍然尝试通过文件头判断
        try:
            file_start = file.read(5)
            file.seek(0)
            return file_start.startswith(b'%PDF-')
        except:
            return False


def url_is_pdf(file):
    """
    判断文件是否为PDF格式。

    :param file: 文件对象
    :return: 如果文件是PDF格式，则返回True，否则返回False
    """
    # 检查文件内容
    file_start = file.read(5)
    file.seek(0)
    if not file_start.startswith(b'%PDF-'):
        return False

    return True


def calculate_file_hash(file, algorithm='sha256'):
    """
    计算给定文件的哈希值。

    :param file: 文件对象
    :param algorithm: 哈希算法的名字，如:'sha256', 'md5', 'sha1'等
    :return: 文件的哈希值
    """
    hash_func = getattr(hashlib, algorithm)()
    block_size = 65536  # 64KB chunks
    # with open(file_path, 'rb') as file:
    buffer = file.read(block_size)
    while len(buffer) > 0:
        hash_func.update(buffer)
        buffer = file.read(block_size)
    file.seek(0)
    return hash_func.hexdigest()


def singleton_func(cls):
    instance = {}

    def _singleton(*args, **kwargs):
        if cls not in instance:
            instance[cls] = cls(*args, **kwargs)
        return instance[cls]

    return _singleton