test_s3.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. import json
  2. import os
  3. import pytest
  4. from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
  @pytest.mark.skipif(
      # Skip unless every variable the test reads is set, not only the access key.
      any(
          os.getenv(name) is None
          for name in ('S3_BUCKET', 'S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_ENDPOINT')
      ),
      reason='need s3 config!',
  )
  def test_s3_reader_writer():
      """Exercise S3DataReader / S3DataWriter constructed with an empty prefix.

      The S3 settings must be provided through the environment: export
      S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export S3_SECRET_KEY=xxx export
      S3_ENDPOINT=xxx. The test is skipped when any of them is unset.

      The jsonl object read below is never written by this test, so it must
      already exist in the configured bucket.
      """
      bucket = os.getenv('S3_BUCKET', '')
      ak = os.getenv('S3_ACCESS_KEY', '')
      sk = os.getenv('S3_SECRET_KEY', '')
      endpoint_url = os.getenv('S3_ENDPOINT', '')

      reader = S3DataReader('', bucket, ak, sk, endpoint_url)
      writer = S3DataWriter('', bucket, ak, sk, endpoint_url)

      jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
      out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'

      # Reading by bare key and by full s3:// URL must return the same bytes.
      bits = reader.read(jsonl_key)
      assert bits == reader.read(f's3://{bucket}/{jsonl_key}')

      # A "?bytes=566,713" suffix on the key must return the same bytes as
      # read_at(key, 566, 713), and that slice must parse as non-empty JSON.
      bits = reader.read(f'{jsonl_key}?bytes=566,713')
      assert bits == reader.read_at(jsonl_key, 566, 713)
      assert len(json.loads(bits)) > 0

      # Text written with write_string is read back as its encoded bytes.
      writer.write_string(f'{out_dir}/test01.txt', 'abc')
      assert 'abc'.encode() == reader.read(f'{out_dir}/test01.txt')

      # NOTE(review): the write path starts with the bucket name but carries no
      # s3:// scheme, while the read-back uses the bare key -- confirm that
      # S3DataWriter.write resolves both forms to the same object.
      writer.write(f'{bucket}/{out_dir}/test02.txt', '123'.encode())
      assert '123'.encode() == reader.read(f'{out_dir}/test02.txt')
  @pytest.mark.skipif(
      # Skip unless every variable the test reads is set, not only the access key.
      any(
          os.getenv(name) is None
          for name in ('S3_BUCKET', 'S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_ENDPOINT')
      ),
      reason='need s3 config!',
  )
  def test_s3_reader_writer_with_prefix():
      """Exercise S3DataReader / S3DataWriter constructed with a key prefix.

      The S3 settings must be provided through the environment: export
      S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export S3_SECRET_KEY=xxx export
      S3_ENDPOINT=xxx. The test is skipped when any of them is unset.

      Both objects are built with the prefix 'meta-index'; the assertions check
      that a bare key gives the same bytes as the full
      s3://<bucket>/<prefix>/<key> URL. The jsonl object read below is never
      written by this test, so it must already exist in the configured bucket.
      """
      bucket = os.getenv('S3_BUCKET', '')
      ak = os.getenv('S3_ACCESS_KEY', '')
      sk = os.getenv('S3_SECRET_KEY', '')
      endpoint_url = os.getenv('S3_ENDPOINT', '')
      prefix = 'meta-index'

      reader = S3DataReader(prefix, bucket, ak, sk, endpoint_url)
      writer = S3DataWriter(prefix, bucket, ak, sk, endpoint_url)

      jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'
      out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'

      # Reading by bare key and by full s3:// URL (prefix included) must agree.
      bits = reader.read(jsonl_key)
      assert bits == reader.read(f's3://{bucket}/{prefix}/{jsonl_key}')

      # A "?bytes=566,713" suffix on the key must return the same bytes as
      # read_at(key, 566, 713), and that slice must parse as non-empty JSON.
      bits = reader.read(f'{jsonl_key}?bytes=566,713')
      assert bits == reader.read_at(jsonl_key, 566, 713)
      assert len(json.loads(bits)) > 0

      # Text written under a bare key must be readable back both by that key
      # and by the full s3:// URL that spells out the prefix.
      writer.write_string(f'{out_dir}/test01.txt', 'abc')
      assert 'abc'.encode() == reader.read(f'{out_dir}/test01.txt')
      assert 'abc'.encode() == reader.read(
          f's3://{bucket}/{prefix}/{out_dir}/test01.txt'
      )

      # NOTE(review): the write path starts with bucket and prefix but carries
      # no s3:// scheme, while the read-back uses the bare key -- confirm that
      # S3DataWriter.write resolves both forms to the same object.
      writer.write(f'{bucket}/{prefix}/{out_dir}/test02.txt', '123'.encode())
      assert '123'.encode() == reader.read(f'{out_dir}/test02.txt')