8 月之前 · 034034c6e0
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
@@ -1,37 +1,51 @@
 
				 import hashlib
			
 
				 import mimetypes
			
 
				+import urllib.parse
			
 
				 
			
 
				 
			
 
				 def _has_pdf_header(file):
				     """
				     Check whether a binary stream begins with the PDF magic bytes.

				     Reads five bytes from the stream's current position and then rewinds
				     the stream to offset 0 so the caller can consume it from the start.

				     :param file: file-like object exposing ``read`` and ``seek``
				     :return: True if the bytes read start with ``%PDF-``; False otherwise,
				              including when the stream cannot be read or rewound, or
				              when it yields text instead of bytes
				     """
				     try:
				         header = file.read(5)
				         file.seek(0)  # rewind so the caller can process the whole file
				     except Exception as e:
				         # Best-effort check: an unreadable or unseekable stream is
				         # reported as "not a PDF" rather than propagating the error.
				         print(f"Error checking PDF format: {str(e)}")
				         return False
				     # A text-mode stream returns str; comparing it against a bytes prefix
				     # would raise TypeError, so it is rejected up front.
				     if not isinstance(header, (bytes, bytearray)):
				         return False
				     return header.startswith(b'%PDF-')


				 def is_pdf(filename, file):
				     """
				     Decide whether an upload is a PDF, tolerating URL-encoded file names.

				     The file name is URL-decoded and its MIME type is guessed from the
				     extension. A name whose type cannot be guessed (for example a link
				     without an extension) is judged by content alone; a name that maps
				     to some other type is rejected without touching the stream; a name
				     that maps to ``application/pdf`` must additionally start with the
				     ``%PDF-`` magic bytes.

				     :param filename: file name, possibly percent-encoded
				     :param file: binary file-like object; it is rewound to offset 0
				                  whenever its content is inspected
				     :return: True if the file is considered a PDF, False otherwise
				     """
				     try:
				         # Percent-decode so that encoded or non-ASCII names still expose
				         # their real extension to the MIME lookup.
				         decoded_filename = urllib.parse.unquote(filename)
				         mime_type, _ = mimetypes.guess_type(decoded_filename)
				         print(f"Detected MIME type: {mime_type}")
				     except Exception as e:
				         # An unusable file name (e.g. None) must not abort the check;
				         # fall back to inspecting the content only.
				         print(f"Error checking PDF format: {str(e)}")
				         mime_type = None

				     # A known, non-PDF extension is rejected without reading the stream.
				     if mime_type is not None and mime_type != 'application/pdf':
				         return False

				     return _has_pdf_header(file)
			
 
				 
			
 
				 
			
 
				 def url_is_pdf(file):