test_multi_bucket_s3.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. import json
  2. import os
  3. import fitz
  4. import pytest
  5. from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
  6. MultiBucketS3DataWriter)
  7. from magic_pdf.data.schemas import S3Config
  8. @pytest.mark.skipif(
  9. os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
  10. )
  11. def test_multi_bucket_s3_reader_writer():
  12. """test multi bucket s3 reader writer must config s3 config in the
  13. environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
  14. S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
  15. export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
  16. """
  17. bucket = os.getenv('S3_BUCKET', '')
  18. ak = os.getenv('S3_ACCESS_KEY', '')
  19. sk = os.getenv('S3_SECRET_KEY', '')
  20. endpoint_url = os.getenv('S3_ENDPOINT', '')
  21. bucket_2 = os.getenv('S3_BUCKET_2', '')
  22. ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
  23. sk_2 = os.getenv('S3_SECRET_KEY_2', '')
  24. endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
  25. s3configs = [
  26. S3Config(
  27. bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
  28. ),
  29. S3Config(
  30. bucket_name=bucket_2,
  31. access_key=ak_2,
  32. secret_key=sk_2,
  33. endpoint_url=endpoint_url_2,
  34. ),
  35. ]
  36. reader = MultiBucketS3DataReader(bucket, s3configs)
  37. writer = MultiBucketS3DataWriter(bucket, s3configs)
  38. bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
  39. assert bits == reader.read(
  40. f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
  41. )
  42. bits = reader.read(
  43. f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
  44. )
  45. docs = fitz.open('pdf', bits)
  46. assert len(docs) == 10
  47. bits = reader.read(
  48. 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
  49. )
  50. assert bits == reader.read_at(
  51. 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
  52. )
  53. assert len(json.loads(bits)) > 0
  54. writer.write_string(
  55. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
  56. )
  57. assert 'abc'.encode() == reader.read(
  58. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
  59. )
  60. writer.write(
  61. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
  62. '123'.encode(),
  63. )
  64. assert '123'.encode() == reader.read(
  65. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
  66. )
  67. @pytest.mark.skipif(
  68. os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
  69. )
  70. def test_multi_bucket_s3_reader_writer_with_prefix():
  71. """test multi bucket s3 reader writer must config s3 config in the
  72. environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
  73. S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
  74. export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
  75. """
  76. bucket = os.getenv('S3_BUCKET', '')
  77. ak = os.getenv('S3_ACCESS_KEY', '')
  78. sk = os.getenv('S3_SECRET_KEY', '')
  79. endpoint_url = os.getenv('S3_ENDPOINT', '')
  80. bucket_2 = os.getenv('S3_BUCKET_2', '')
  81. ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
  82. sk_2 = os.getenv('S3_SECRET_KEY_2', '')
  83. endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
  84. s3configs = [
  85. S3Config(
  86. bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
  87. ),
  88. S3Config(
  89. bucket_name=bucket_2,
  90. access_key=ak_2,
  91. secret_key=sk_2,
  92. endpoint_url=endpoint_url_2,
  93. ),
  94. ]
  95. prefix = 'meta-index'
  96. reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
  97. writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
  98. bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
  99. assert bits == reader.read(
  100. f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
  101. )
  102. bits = reader.read(
  103. f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
  104. )
  105. docs = fitz.open('pdf', bits)
  106. assert len(docs) == 10
  107. bits = reader.read(
  108. 'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
  109. )
  110. assert bits == reader.read_at(
  111. 'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
  112. )
  113. assert len(json.loads(bits)) > 0
  114. writer.write_string(
  115. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
  116. )
  117. assert 'abc'.encode() == reader.read(
  118. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
  119. )
  120. assert 'abc'.encode() == reader.read(
  121. f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
  122. )
  123. writer.write(
  124. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
  125. '123'.encode(),
  126. )
  127. assert '123'.encode() == reader.read(
  128. 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
  129. )