Browse Source

feat: rewrite code snippet

xu rui 11 months ago
parent
commit
3cd51d4941

+ 89 - 61
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -87,56 +87,70 @@ Read Examples
 
 .. code:: python
 
+    import os 
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
+    # file based related
     file_based_reader1 = FileBasedDataReader('')
 
-    ## will read file abc 
-    file_based_reader1.read('abc') 
+    ## will read file abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
     ## will read /tmp/abc
     file_based_reader2.read('abc')
 
-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## will read /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
 
     # multi bucket s3 related
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
-    
-    ## will read s3://test_bucket1/test_prefix/abc
+
+    ## will read s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## will read s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## will read s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
     # s3 related
     s3_reader1 = S3DataReader(
-        "test_prefix",
-        "test_bucket",
-        "ak",
-        "sk",
-        "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## will read s3://test_bucket/test_prefix/abc 
+    ## will read s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
 
-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+    ## will read s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
 
 
 Write Examples
@@ -144,65 +158,79 @@ Write Examples
 
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
-    file_based_writer1 = FileBasedDataWriter('')
+    # file based related
+    file_based_writer1 = FileBasedDataWriter("")
 
     ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode()) 
+    file_based_writer1.write("abc", "123".encode())
 
     ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123') 
+    file_based_writer1.write_string("abc", "123")
 
-    file_based_writer2 = FileBasedDataWriter('/tmp')
+    file_based_writer2 = FileBasedDataWriter("/tmp")
 
     ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
+    file_based_writer2.write_string("abc", "123")
 
-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
+    ## will write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
 
     # multi bucket s3 related
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-    
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
+
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## will write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
     # s3 related
-    s3_writer1 = S3DataWriter(
-        "test_prefix",
-        "test_bucket",
-        "ak",
-        "sk",
-        "localhost"
-    )
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
+
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write_string('abc', '123')
+    ## will write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
 
 
 Check :doc:`../../api/data_reader_writer` for more details

+ 2 - 2
next_docs/en/user_guide/data/read_api.rst

@@ -80,10 +80,10 @@ Read images from path or directory
     from magic_pdf.data.read_api import *
 
     # read from image path 
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # replace with real file path
 
     # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=["png", "jpg"])  # replace with real directory 
 
 
 Check :doc:`../../api/read_api` for more details

+ 103 - 75
next_docs/zh_cn/user_guide/data/data_reader_writer.rst

@@ -73,118 +73,146 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
 ---------
 .. code:: python
 
-    from magic_pdf.data.data_reader_writer import * 
+    import os 
+    from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # 文件相关的
+    # 初始化 reader
     file_based_reader1 = FileBasedDataReader('')
 
-    ## 将读取文件 abc 
-    file_based_reader1.read('abc') 
+    ## 读本地文件 abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
-    ## 将读取 /tmp/abc
+    ## 读本地文件 /tmp/abc
     file_based_reader2.read('abc')
 
-    ## 将读取 /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## 读本地文件 /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
+
+    # 初始化多桶 s3 reader
+    bucket = "bucket"               # 替换为有效的 bucket
+    ak = "ak"                       # 替换为有效的 access key
+    sk = "sk"                       # 替换为有效的 secret key
+    endpoint_url = "endpoint_url"   # 替换为有效的 endpoint_url
+
+    bucket_2 = "bucket_2"               # 替换为有效的 bucket
+    ak_2 = "ak_2"                       # 替换为有效的 access key
+    sk_2 = "sk_2"                       # 替换为有效的 secret key 
+    endpoint_url_2 = "endpoint_url_2"   # 替换为有效的 endpoint_url
 
-    # 多桶 S3 相关的
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
 
-    ## 将读取 s3://test_bucket1/test_prefix/abc
+    ## 读文件 s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## 将读取 s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## 读文件 s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## 将读取 s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## 读文件 s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
-    # S3 相关的
+    # 初始化 s3 reader
     s3_reader1 = S3DataReader(
-        "test_prefix",
-        "test_bucket",
-        "ak",
-        "sk",
-        "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## 将读取 s3://test_bucket/test_prefix/abc 
+    ## 读文件 s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
 
-    ## 将读取 s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+    ## 读文件 s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
+
 
 写入示例
 ----------
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
+
+    # 初始化 writer
+    file_based_writer1 = FileBasedDataWriter("")
+
+    ## 写数据 123 to abc
+    file_based_writer1.write("abc", "123".encode())
+
+    ## 写数据 123 to abc
+    file_based_writer1.write_string("abc", "123")
+
+    file_based_writer2 = FileBasedDataWriter("/tmp")
+
+    ## 写数据 123 to /tmp/abc
+    file_based_writer2.write_string("abc", "123")
+
+    ## 写数据 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
+
+    # 初始化多桶 s3 writer
+    bucket = "bucket"               # 替换为有效的 bucket
+    ak = "ak"                       # 替换为有效的 access key
+    sk = "sk"                       # 替换为有效的 secret key
+    endpoint_url = "endpoint_url"   # 替换为有效的 endpoint_url
+
+    bucket_2 = "bucket_2"               # 替换为有效的 bucket
+    ak_2 = "ak_2"                       # 替换为有效的 access key
+    sk_2 = "sk_2"                       # 替换为有效的 secret key 
+    endpoint_url_2 = "endpoint_url_2"   # 替换为有效的 endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
 
-    # 文件相关的
-    file_based_writer1 = FileBasedDataWriter('')
-
-    ## 将写入 123 到 abc
-    file_based_writer1.write('abc', '123'.encode()) 
-
-    ## 将写入 123 到 abc
-    file_based_writer1.write_string('abc', '123') 
-
-    file_based_writer2 = FileBasedDataWriter('/tmp')
-
-    ## 将写入 123 到 /tmp/abc
-    file_based_writer2.write_string('abc', '123')
-
-    ## 将写入 123 到 /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
-
-    # 多桶 S3 相关的
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-
-    ## 将写入 123 到 s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    ## 写数据 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## 将写入 123 到 s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## 写数据 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## 将写入 123 到 s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## 写数据 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
 
-    ## 将写入 123 到 s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## 写数据 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
-    # S3 相关的
-    s3_writer1 = S3DataWriter(
-        "test_prefix",
-        "test_bucket",
-        "ak",
-        "sk",
-        "localhost"
-    )
+    # 初始化 s3 writer
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
 
-    ## 将写入 123 到 s3://test_bucket/test_prefix/abc 
-    s3_writer1.write('abc', '123'.encode())
+    ## 写数据 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## 将写入 123 到 s3://test_bucket/test_prefix/abc 
-    s3_writer1.write_string('abc', '123')
+    ## 写数据 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## 将写入 123 到 s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+    ## 写数据 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 

+ 4 - 4
next_docs/zh_cn/user_guide/data/read_api.rst

@@ -61,10 +61,10 @@ read_local_pdfs
     from magic_pdf.data.read_api import *
 
     # 读取 PDF 路径
-    datasets = read_local_pdfs("tt.pdf")
+    datasets = read_local_pdfs("tt.pdf")  # 替换为有效的文件
 
     # 读取目录下的 PDF 文件
-    datasets = read_local_pdfs("pdfs/")
+    datasets = read_local_pdfs("pdfs/")   # 替换为有效的文件目录
 
 read_local_images
 ^^^^^^^^^^^^^^^^^^^
@@ -76,7 +76,7 @@ read_local_images
     from magic_pdf.data.read_api import *
 
     # 从图像路径读取
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # 替换为有效的文件
 
     # 从目录读取以 suffixes 数组中指定后缀结尾的文件
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=["png", "jpg"])  # 替换为有效的文件目录