| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
- import json
- import os
- import shutil
- import tempfile
- from magic_pdf.integrations.rag.api import DataReader, RagDocumentReader
- from magic_pdf.integrations.rag.type import CategoryType
- from magic_pdf.integrations.rag.utils import \
- convert_middle_json_to_layout_elements
def test_rag_document_reader():
    """Check that RagDocumentReader exposes the converted middle.json fixture.

    The fixture is converted with convert_middle_json_to_layout_elements and
    wrapped in a RagDocumentReader. The test expects one page holding ten
    items, a relation map with three entries, and a first item whose
    category is CategoryType.text.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp already creates the directory, so no second makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        # Explicit encoding keeps JSON decoding independent of the locale.
        with open(
            'tests/unittest/test_integrations/test_rag/assets/middle.json',
            encoding='utf-8',
        ) as f:
            json_data = json.load(f)

        res = convert_middle_json_to_layout_elements(json_data, temp_output_dir)
        doc = RagDocumentReader(res)

        pages = list(doc)
        assert len(pages) == 1

        page = pages[0]
        items = list(page)
        assert len(items) == 10
        assert len(page.get_rel_map()) == 3

        item = items[0]
        assert item.category_type == CategoryType.text
    finally:
        # teardown: runs even when an assertion fails, so the temporary
        # directory does not leak.
        shutil.rmtree(temp_output_dir)
def test_data_reader():
    """Check that DataReader finds and returns every document in the assets.

    A DataReader is built over the test assets directory with the 'ocr'
    method and a temporary output directory. The test expects two documents
    and a non-None result for each document index.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp already creates the directory, so no second makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        data_reader = DataReader(
            'tests/unittest/test_integrations/test_rag/assets',
            'ocr',
            temp_output_dir,
        )

        documents_count = data_reader.get_documents_count()
        assert documents_count == 2

        for idx in range(documents_count):
            document = data_reader.get_document_result(idx)
            assert document is not None
    finally:
        # teardown: runs even when an assertion fails, so the temporary
        # directory does not leak.
        shutil.rmtree(temp_output_dir)
|