inference.py

import os

from openai import OpenAI

from dots_ocr.utils.image_utils import PILimage_to_base64


def inference_with_vllm(
    image,
    prompt,
    ip="localhost",
    port=8000,
    temperature=0.1,
    top_p=0.9,
    max_completion_tokens=32768,
    model_name="model",
):
    """Send a single image + prompt request to a vLLM OpenAI-compatible server."""
    addr = f"http://{ip}:{port}/v1"
    client = OpenAI(api_key=os.environ.get("API_KEY", "0"), base_url=addr)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": PILimage_to_base64(image)},
                },
                # Without the "<|img|><|imgpad|><|endofimg|>" prefix, vLLM v1 inserts a "\n" here.
                {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"},
            ],
        }
    ]
    try:
        response = client.chat.completions.create(
            messages=messages,
            model=model_name,
            max_completion_tokens=max_completion_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return response.choices[0].message.content
    except Exception as e:  # the OpenAI v1 client raises its own exception types, not requests'
        print(f"request error: {e}")
        return None
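

# A minimal usage sketch (not part of the original file): it assumes a vLLM server
# serving the model is already running on localhost:8000, and that Pillow is
# installed. The image path, prompt text, and model name below are placeholders.
if __name__ == "__main__":
    from PIL import Image

    image = Image.open("example.png")  # hypothetical input image
    result = inference_with_vllm(
        image,
        prompt="Parse the document layout.",  # hypothetical prompt
        ip="localhost",
        port=8000,
        model_name="model",
    )
    print(result)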