| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- import os
- import click
- from llama_index.core.schema import TextNode
- from llama_index.embeddings.dashscope import (DashScopeEmbedding,
- DashScopeTextEmbeddingModels,
- DashScopeTextEmbeddingType)
- from llama_index.vector_stores.elasticsearch import ElasticsearchStore
- from magic_pdf.integrations.rag.api import DataReader
- es_vec_store = ElasticsearchStore(
- index_name='rag_index',
- es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
- es_user=os.getenv('ES_USER', 'elastic'),
- es_password=os.getenv('ES_PASSWORD', 'llama_index'),
- )
- # Create embeddings
- # text_type=`document` to build index
- def embed_node(node):
- embedder = DashScopeEmbedding(
- model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
- text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
- )
- result_embeddings = embedder.get_text_embedding(node.text)
- node.embedding = result_embeddings
- return node
- @click.command()
- @click.option(
- '-p',
- '--path',
- 'path',
- type=click.Path(exists=True),
- required=True,
- help='local pdf filepath or directory',
- )
- def cli(path):
- output_dir = '/tmp/magic_pdf/integrations/rag/'
- os.makedirs(output_dir, exist_ok=True)
- documents = DataReader(path, 'ocr', output_dir)
- # build nodes
- nodes = []
- for idx in range(documents.get_documents_count()):
- doc = documents.get_document_result(idx)
- if doc is None: # something wrong happens when parse pdf !
- continue
- for page in iter(
- doc): # iterate documents from initial page to last page !
- for element in iter(page): # iterate the element from all page !
- if element.text is None:
- continue
- nodes.append(
- embed_node(
- TextNode(text=element.text,
- metadata={'purpose': 'demo'})))
- es_vec_store.add(nodes)
- if __name__ == '__main__':
- cli()
|