simple_ocr_test.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
#!/usr/bin/env python3
"""
Simplified OCR verification script - fetch the raw analysis content.
"""
import os
import base64
import json
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (python-dotenv) at import time, before the
# os.getenv() configuration lookups performed by the function below.
load_dotenv()
  13. def simple_ocr_verification():
  14. """简化的OCR验证,返回原始分析文本"""
  15. # 获取配置
  16. api_key = os.getenv("YUSYS_MULTIMODAL_API_KEY")
  17. api_base = os.getenv("YUSYS_MULTIMODAL_API_BASE")
  18. model_id = os.getenv("YUSYS_MULTIMODAL_ID")
  19. model_name = model_id.replace("openai/", "") if model_id else ""
  20. # 文件路径
  21. image_path = "工大照片-1.jpg"
  22. ocr_json_path = "demo_54fa7ad0_page_1.json"
  23. # 读取图片
  24. with open(image_path, "rb") as f:
  25. image_data = base64.b64encode(f.read()).decode('utf-8')
  26. # 读取OCR结果
  27. with open(ocr_json_path, "r", encoding='utf-8') as f:
  28. ocr_results = json.load(f)
  29. # 构建OCR文本摘要
  30. ocr_summary = f"OCR识别了{len(ocr_results)}个项目:\\n"
  31. for i, item in enumerate(ocr_results[:5], 1): # 只显示前5个
  32. bbox = item.get('bbox', [])
  33. text = item.get('text', '')[:50] # 限制文本长度
  34. ocr_summary += f"{i}. 位置{bbox} - 文本: {text}\\n"
  35. if len(ocr_results) > 5:
  36. ocr_summary += f"... 还有{len(ocr_results) - 5}个项目\\n"
  37. # 简化的提示词
  38. prompt = f"""请分析这张图片,并与OCR识别结果进行对比。
  39. {ocr_summary}
  40. 请详细描述:
  41. 1. 图片的内容是什么?
  42. 2. OCR识别的结果是否与图片内容匹配?
  43. 3. 你发现了哪些明显的错误或问题?
  44. 4. 图片与OCR结果是否匹配同一份文档?
  45. 请用中文详细回答,不需要JSON格式。"""
  46. # 构建消息
  47. messages = [
  48. {
  49. "role": "user",
  50. "content": [
  51. {"type": "text", "text": prompt},
  52. {
  53. "type": "image_url",
  54. "image_url": {
  55. "url": f"data:image/jpeg;base64,{image_data}"
  56. }
  57. }
  58. ]
  59. }
  60. ]
  61. # 调用API
  62. client = OpenAI(api_key=api_key, base_url=api_base)
  63. print("正在分析图片和OCR结果...")
  64. response = client.chat.completions.create(
  65. model=model_name,
  66. messages=messages, # type: ignore
  67. temperature=0.3,
  68. max_tokens=2048
  69. )
  70. analysis = response.choices[0].message.content or "未获取到分析结果"
  71. print("\\n=== VLM详细分析结果 ===")
  72. print(analysis)
  73. # 保存原始分析
  74. with open("raw_analysis.txt", "w", encoding="utf-8") as f:
  75. f.write(f"图片: {image_path}\\n")
  76. f.write(f"OCR文件: {ocr_json_path}\\n")
  77. f.write(f"模型: {model_name}\\n")
  78. f.write(f"OCR结果摘要:\\n{ocr_summary}\\n\\n")
  79. f.write("=== VLM分析 ===\\n")
  80. f.write(analysis)
  81. print(f"\\n原始分析已保存到: raw_analysis.txt")
# Script entry point: run the verification only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    simple_ocr_verification()