test_pp_doctranslation.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from paddlex import create_pipeline
  15. pipeline = create_pipeline(pipeline="PP-DocTranslation")
  16. input_path = "document_sample.pdf"
  17. output_path = "./output"
  18. chat_bot_config = {
  19. "module_name": "chat_bot",
  20. "model_name": "ernie-3.5-8k",
  21. "base_url": "https://qianfan.baidubce.com/v2",
  22. "api_type": "openai",
  23. "api_key": "api_key", # your api_key
  24. }
  25. if input_path.lower().endswith(".md"):
  26. ori_md_info_list = pipeline.load_from_markdown(input_path)
  27. else:
  28. # Use PP-StructureV3 to get original markdown info list
  29. visual_predict_res = pipeline.visual_predict(
  30. input_path,
  31. use_doc_orientation_classify=False,
  32. use_doc_unwarping=False,
  33. use_common_ocr=True,
  34. use_seal_recognition=True,
  35. use_table_recognition=True,
  36. )
  37. ori_md_info_list = []
  38. for res in visual_predict_res:
  39. layout_parsing_result = res["layout_parsing_result"]
  40. ori_md_info_list.append(layout_parsing_result.markdown)
  41. layout_parsing_result.save_to_img(output_path)
  42. layout_parsing_result.save_to_markdown(output_path)
  43. # To concatenate markdown pages into a single markdown file, when input is a pdf file
  44. if input_path.lower().endswith(".pdf"):
  45. ori_md_info = pipeline.concatenate_markdown_pages(ori_md_info_list)
  46. ori_md_info.save_to_markdown(output_path)
  47. tgt_md_info_list = pipeline.translate(
  48. ori_md_info_list=ori_md_info_list,
  49. target_language="en",
  50. chunk_size=3000,
  51. chat_bot_config=chat_bot_config,
  52. )
  53. for tgt_md_info in tgt_md_info_list:
  54. tgt_md_info.save_to_markdown(output_path)