inference.py

import json
import io
import base64
import math
import os

import requests
from PIL import Image
from openai import OpenAI

from dots_ocr.utils.image_utils import PILimage_to_base64
def inference_with_vllm(
    image,
    prompt,
    ip="localhost",
    port=8000,
    temperature=0.1,
    top_p=0.9,
    max_completion_tokens=32768,
    model_name='model',
):
    addr = f"http://{ip}:{port}/v1"
    client = OpenAI(api_key=os.environ.get("API_KEY", "0"), base_url=addr)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": PILimage_to_base64(image)},
                },
                # Without the "<|img|><|imgpad|><|endofimg|>" prefix, vLLM v1 inserts a "\n" before the prompt.
                {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
            ],
        }
    ]
    try:
        response = client.chat.completions.create(
            messages=messages,
            model=model_name,
            max_completion_tokens=max_completion_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return response.choices[0].message.content
    except Exception as e:
        # The OpenAI client raises its own exception types rather than requests',
        # so catch broadly and return None on any request failure.
        print(f"request error: {e}")
        return None
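

# Hypothetical usage sketch (not part of the original file): shows how
# inference_with_vllm might be called against a locally running vLLM server.
# The image path, prompt text, server address, and model name below are
# illustrative assumptions, not values defined by this module.
if __name__ == "__main__":
    image = Image.open("demo.png")  # any PIL-compatible image file
    prompt = "Please output the layout information of this document image."
    result = inference_with_vllm(
        image,
        prompt,
        ip="localhost",
        port=8000,
        model_name="model",
    )
    print(result)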