test_read_api.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. import os
  2. import pytest
  3. from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
  4. from magic_pdf.data.read_api import (read_jsonl, read_local_images,
  5. read_local_pdfs)
  6. from magic_pdf.data.schemas import S3Config
  7. def test_read_local_pdfs():
  8. datasets = read_local_pdfs('tests/test_data/assets/pdfs')
  9. assert len(datasets) == 2
  10. assert len(datasets[0]) > 0
  11. assert len(datasets[1]) > 0
  12. assert datasets[0].get_page(0).get_page_info().w > 0
  13. assert datasets[0].get_page(0).get_page_info().h > 0
  14. def test_read_local_images():
  15. datasets = read_local_images('tests/test_data/assets/pngs', suffixes=['png'])
  16. assert len(datasets) == 2
  17. assert len(datasets[0]) == 1
  18. assert len(datasets[1]) == 1
  19. assert datasets[0].get_page(0).get_page_info().w > 0
  20. assert datasets[0].get_page(0).get_page_info().h > 0
  21. @pytest.mark.skipif(
  22. os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
  23. )
  24. def test_read_json():
  25. """test multi bucket s3 reader writer must config s3 config in the
  26. environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
  27. S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
  28. export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
  29. """
  30. bucket = os.getenv('S3_BUCKET', '')
  31. ak = os.getenv('S3_ACCESS_KEY', '')
  32. sk = os.getenv('S3_SECRET_KEY', '')
  33. endpoint_url = os.getenv('S3_ENDPOINT', '')
  34. bucket_2 = os.getenv('S3_BUCKET_2', '')
  35. ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
  36. sk_2 = os.getenv('S3_SECRET_KEY_2', '')
  37. endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
  38. s3configs = [
  39. S3Config(
  40. bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
  41. ),
  42. S3Config(
  43. bucket_name=bucket_2,
  44. access_key=ak_2,
  45. secret_key=sk_2,
  46. endpoint_url=endpoint_url_2,
  47. ),
  48. ]
  49. reader = MultiBucketS3DataReader(bucket, s3configs)
  50. datasets = read_jsonl(
  51. f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
  52. reader,
  53. )
  54. assert len(datasets) > 0
  55. assert len(datasets[0]) == 10
  56. datasets = read_jsonl('tests/test_data/assets/jsonl/test_01.jsonl', reader)
  57. assert len(datasets) == 1
  58. assert len(datasets[0]) == 10
  59. datasets = read_jsonl('tests/test_data/assets/jsonl/test_02.jsonl')
  60. assert len(datasets) == 1
  61. assert len(datasets[0]) == 1