瀏覽代碼

update:cleanup requirements.txt

赵小蒙 1 年之前
父節點
當前提交
6c656af65f
共有 3 個文件被更改,包括 28 次插入和 20 次刪除
  1. 13 4
      magic_pdf/libs/path_utils.py
  2. 14 0
      requirements-qa.txt
  3. 1 16
      requirements.txt

+ 13 - 4
magic_pdf/libs/path_utils.py

@@ -1,7 +1,5 @@
 
 
-from s3pathlib import S3Path
-
 def remove_non_official_s3_args(s3path):
     """
     example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
     return arr[0]
 
 def parse_s3path(s3path: str):
-    p = S3Path(remove_non_official_s3_args(s3path))
-    return p.bucket, p.key
+    # from s3pathlib import S3Path
+    # p = S3Path(remove_non_official_s3_args(s3path))
+    # return p.bucket, p.key
+    s3path = remove_non_official_s3_args(s3path).strip()
+    if s3path.startswith(('s3://', 's3a://')):
+        prefix, path = s3path.split('://', 1)
+        bucket_name, key = path.split('/', 1)
+        return bucket_name, key
+    elif s3path.startswith('/'):
+        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
+    else:
+        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
+
 
 def parse_s3_range_params(s3path: str):
     """

+ 14 - 0
requirements-qa.txt

@@ -0,0 +1,14 @@
+Levenshtein
+nltk
+rapidfuzz
+statistics
+openxlab #安装opendatalab
+pandas
+numpy
+matplotlib
+seaborn
+scipy
+scikit-learn
+tqdm
+htmltabletomd
+pypandoc

+ 1 - 16
requirements.txt

@@ -1,26 +1,11 @@
 boto3>=1.28.43
 Brotli>=1.1.0
 click>=8.1.7
-Distance>=0.1.3
 PyMuPDF>=1.24.7
 loguru>=0.6.0
-matplotlib>=3.8.3
 numpy>=1.21.6
-pandas>=1.3.5
 fast-langdetect>=0.1.1
-regex>=2023.12.25
-termcolor>=2.4.0
 wordninja>=2.0.0
 scikit-learn>=1.0.2
-nltk==3.8.1
-s3pathlib>=2.1.1
 pdfminer.six>=20231228
-Levenshtein
-rapidfuzz
-statistics
-openxlab #安装opendatalab
-seaborn
-scipy
-tqdm
-htmltabletomd
-pypandoc
+# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员