data_ingestion.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. import os
  2. import click
  3. from llama_index.core.schema import TextNode
  4. from llama_index.embeddings.dashscope import (DashScopeEmbedding,
  5. DashScopeTextEmbeddingModels,
  6. DashScopeTextEmbeddingType)
  7. from llama_index.vector_stores.elasticsearch import ElasticsearchStore
  8. from magic_pdf.integrations.rag.api import DataReader
  9. es_vec_store = ElasticsearchStore(
  10. index_name='rag_index',
  11. es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
  12. es_user=os.getenv('ES_USER', 'elastic'),
  13. es_password=os.getenv('ES_PASSWORD', 'llama_index'),
  14. )
  15. # Create embeddings
  16. # text_type=`document` to build index
  17. def embed_node(node):
  18. embedder = DashScopeEmbedding(
  19. model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
  20. text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
  21. )
  22. result_embeddings = embedder.get_text_embedding(node.text)
  23. node.embedding = result_embeddings
  24. return node
  25. @click.command()
  26. @click.option(
  27. '-p',
  28. '--path',
  29. 'path',
  30. type=click.Path(exists=True),
  31. required=True,
  32. help='local pdf filepath or directory',
  33. )
  34. def cli(path):
  35. output_dir = '/tmp/magic_pdf/integrations/rag/'
  36. os.makedirs(output_dir, exist_ok=True)
  37. documents = DataReader(path, 'ocr', output_dir)
  38. # build nodes
  39. nodes = []
  40. for idx in range(documents.get_documents_count()):
  41. doc = documents.get_document_result(idx)
  42. if doc is None: # something wrong happens when parse pdf !
  43. continue
  44. for page in iter(
  45. doc): # iterate documents from initial page to last page !
  46. for element in iter(page): # iterate the element from all page !
  47. if element.text is None:
  48. continue
  49. nodes.append(
  50. embed_node(
  51. TextNode(text=element.text,
  52. metadata={'purpose': 'demo'})))
  53. es_vec_store.add(nodes)
  54. if __name__ == '__main__':
  55. cli()