# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import queue
import threading
import time
from itertools import chain
from typing import Any, Dict, Optional, Tuple, Union

import cv2
import numpy as np
from PIL import Image

from ....utils import logging
from ....utils.deps import pipeline_requires_extra
from ...common.batch_sampler import ImageBatchSampler
from ...common.reader import ReadImage
from ...utils.benchmark import benchmark
from ...utils.hpi import HPIConfig
from ...utils.pp_option import PaddlePredictorOption
from .._parallel import AutoParallelImageSimpleInferencePipeline
from ..base import BasePipeline
from ..components import CropByBoxes
from ..layout_parsing.utils import gather_imgs
from .result import PaddleOCRVLBlock, PaddleOCRVLResult
from .uilts import (
    convert_otsl_to_html,
    crop_margin,
    filter_overlap_boxes,
    merge_blocks,
    tokenize_figure_of_table,
    truncate_repetitive_content,
    untokenize_figure_of_table,
)

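# Layout labels whose crops are kept as images in the final result instead of
# being sent to the VLM for recognition ("chart" joins this set when chart
# recognition is disabled).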
IMAGE_LABELS = ["image", "header_image", "footer_image", "seal"]


@benchmark.time_methods
class _PaddleOCRVLPipeline(BasePipeline):
    """PaddleOCR-VL pipeline."""

    def __init__(
        self,
        config: Dict,
        device: Optional[str] = None,
        pp_option: Optional[PaddlePredictorOption] = None,
        use_hpip: bool = False,
        hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None,
    ) -> None:
        """
        Initializes the class with given configurations and options.

        Args:
            config (Dict): Configuration dictionary containing various settings.
            device (str, optional): Device to run the predictions on. Defaults to None.
            pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
            use_hpip (bool, optional): Whether to use the high-performance
                inference plugin (HPIP) by default. Defaults to False.
            hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
                The default high-performance inference configuration dictionary.
                Defaults to None.
        """
        super().__init__(
            device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config
        )

        self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
        if self.use_doc_preprocessor:
            doc_preprocessor_config = config.get("SubPipelines", {}).get(
                "DocPreprocessor",
                {
                    "pipeline_config_error": "config error for doc_preprocessor_pipeline!"
                },
            )
            self.doc_preprocessor_pipeline = self.create_pipeline(
                doc_preprocessor_config
            )

        self.use_layout_detection = config.get("use_layout_detection", True)
        if self.use_layout_detection:
            layout_det_config = config.get("SubModules", {}).get(
                "LayoutDetection",
                {"model_config_error": "config error for layout_det_model!"},
            )
            model_name = layout_det_config.get("model_name", None)
            # assert (
            #     model_name is not None and model_name == "PP-DocLayoutV2"
            # ), "model_name must be PP-DocLayoutV2"
            layout_kwargs = {}
            if (threshold := layout_det_config.get("threshold", None)) is not None:
                layout_kwargs["threshold"] = threshold
            if (layout_nms := layout_det_config.get("layout_nms", None)) is not None:
                layout_kwargs["layout_nms"] = layout_nms
            if (
                layout_unclip_ratio := layout_det_config.get(
                    "layout_unclip_ratio", None
                )
            ) is not None:
                layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio
            if (
                layout_merge_bboxes_mode := layout_det_config.get(
                    "layout_merge_bboxes_mode", None
                )
            ) is not None:
                layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
            self.layout_det_model = self.create_model(
                layout_det_config, **layout_kwargs
            )

        self.use_chart_recognition = config.get("use_chart_recognition", True)

        vl_rec_config = config.get("SubModules", {}).get(
            "VLRecognition",
            {"model_config_error": "config error for vl_rec_model!"},
        )
        self.vl_rec_model = self.create_model(vl_rec_config)

        self.format_block_content = config.get("format_block_content", False)

        self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
        self.img_reader = ReadImage(format="BGR")
        self.crop_by_boxes = CropByBoxes()

        self.use_queues = config.get("use_queues", False)

    def close(self):
        self.vl_rec_model.close()

    def get_model_settings(
        self,
        use_doc_orientation_classify: Union[bool, None],
        use_doc_unwarping: Union[bool, None],
        use_layout_detection: Union[bool, None],
        use_chart_recognition: Union[bool, None],
        format_block_content: Union[bool, None],
    ) -> dict:
        """
        Get the model settings based on the provided parameters or default values.

        Args:
            use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
            use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
            use_layout_detection (Union[bool, None]): Enables layout detection if True. Defaults to system setting if None.
            use_chart_recognition (Union[bool, None]): Enables chart recognition if True. Defaults to system setting if None.
            format_block_content (Union[bool, None]): Enables block content formatting if True. Defaults to system setting if None.

        Returns:
            dict: A dictionary containing the model settings.
        """
        if use_doc_orientation_classify is None and use_doc_unwarping is None:
            use_doc_preprocessor = self.use_doc_preprocessor
        else:
            if use_doc_orientation_classify is True or use_doc_unwarping is True:
                use_doc_preprocessor = True
            else:
                use_doc_preprocessor = False

        if use_layout_detection is None:
            use_layout_detection = self.use_layout_detection

        if use_chart_recognition is None:
            use_chart_recognition = self.use_chart_recognition

        if format_block_content is None:
            format_block_content = self.format_block_content

        return dict(
            use_doc_preprocessor=use_doc_preprocessor,
            use_layout_detection=use_layout_detection,
            use_chart_recognition=use_chart_recognition,
            format_block_content=format_block_content,
        )

    def check_model_settings_valid(self, input_params: dict) -> bool:
        """
        Check if the input parameters are valid based on the initialized models.

        Args:
            input_params (Dict): A dictionary containing input parameters.

        Returns:
            bool: True if all required models are initialized according to input parameters, False otherwise.
        """
        if input_params["use_doc_preprocessor"] and not self.use_doc_preprocessor:
            logging.error(
                "Set use_doc_preprocessor, but the models for doc preprocessor are not initialized.",
            )
            return False

        return True

    def get_layout_parsing_results(
        self,
        images,
        layout_det_results,
        imgs_in_doc,
        use_chart_recognition=False,
        vlm_kwargs=None,
    ):
        blocks = []
        block_imgs = []
        text_prompts = []
        vlm_block_ids = []
        figure_token_maps = []
        drop_figures_set = set()
        image_labels = (
            IMAGE_LABELS if use_chart_recognition else IMAGE_LABELS + ["chart"]
        )
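        # First pass: crop every layout block and queue each non-image block as
        # an (image, prompt) pair for VLM recognition. Table crops additionally
        # get embedded figures replaced by placeholder tokens so the VLM sees a
        # cleaner image.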
        for i, (image, layout_det_res, imgs_in_doc_for_img) in enumerate(
            zip(images, layout_det_results, imgs_in_doc)
        ):
            layout_det_res = filter_overlap_boxes(layout_det_res)
            boxes = layout_det_res["boxes"]
            blocks_for_img = self.crop_by_boxes(image, boxes)
            blocks_for_img = merge_blocks(
                blocks_for_img, non_merge_labels=image_labels + ["table"]
            )
            blocks.append(blocks_for_img)
            for j, block in enumerate(blocks_for_img):
                block_img = block["img"]
                block_label = block["label"]
                if block_label not in image_labels and block_img is not None:
                    figure_token_map = {}
                    text_prompt = "OCR:"
                    drop_figures = []
                    if block_label == "table":
                        text_prompt = "Table Recognition:"
                        block_img, figure_token_map, drop_figures = (
                            tokenize_figure_of_table(
                                block_img, block["box"], imgs_in_doc_for_img
                            )
                        )
                    elif block_label == "chart" and use_chart_recognition:
                        text_prompt = "Chart Recognition:"
                    elif "formula" in block_label and block_label != "formula_number":
                        text_prompt = "Formula Recognition:"
                        block_img = crop_margin(block_img)
                    block_imgs.append(block_img)
                    text_prompts.append(text_prompt)
                    figure_token_maps.append(figure_token_map)
                    vlm_block_ids.append((i, j))
                    drop_figures_set.update(drop_figures)

        if vlm_kwargs is None:
            vlm_kwargs = {}
        elif vlm_kwargs.get("max_new_tokens", None) is None:
            vlm_kwargs["max_new_tokens"] = 4096
        kwargs = {
            "use_cache": True,
            **vlm_kwargs,
        }
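        # Run VL recognition once over all collected crops, letting the model
        # batch across blocks and pages instead of being called per block.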
        vl_rec_results = list(
            self.vl_rec_model.predict(
                [
                    {
                        "image": block_img,
                        "query": text_prompt,
                    }
                    for block_img, text_prompt in zip(block_imgs, text_prompts)
                ],
                skip_special_tokens=True,
                **kwargs,
            )
        )
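
        # Second pass: walk the blocks in their original order and stitch the
        # VLM outputs back in; curr_vlm_block_idx advances in lockstep because
        # the queries were collected in the same (page, block) order.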
        parsing_res_lists = []
        table_res_lists = []
        curr_vlm_block_idx = 0
        for i, blocks_for_img in enumerate(blocks):
            parsing_res_list = []
            table_res_list = []
            for j, block in enumerate(blocks_for_img):
                block_img = block["img"]
                block_bbox = block["box"]
                block_label = block["label"]
                block_content = ""
                if curr_vlm_block_idx < len(vlm_block_ids) and vlm_block_ids[
                    curr_vlm_block_idx
                ] == (i, j):
                    vl_rec_result = vl_rec_results[curr_vlm_block_idx]
                    figure_token_map = figure_token_maps[curr_vlm_block_idx]
                    block_img4vl = block_imgs[curr_vlm_block_idx]
                    curr_vlm_block_idx += 1
                    vl_rec_result["image"] = block_img4vl
                    result_str = vl_rec_result.get("result", "")
                    if result_str is None:
                        result_str = ""
                    result_str = truncate_repetitive_content(result_str)
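                    # Normalize LaTeX delimiters: when the model emits \( \) or
                    # \[ \] pairs, strip stray dollar signs and rewrite the
                    # delimiters as $ ... $ / $$ ... $$ for consistent Markdown
                    # rendering.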
  267. if ("\\(" in result_str and "\\)" in result_str) or (
  268. "\\[" in result_str and "\\]" in result_str
  269. ):
  270. result_str = result_str.replace("$", "")
  271. result_str = (
  272. result_str.replace("\(", " $ ")
  273. .replace("\\)", " $ ")
  274. .replace("\\[", " $$ ")
  275. .replace("\\]", " $$ ")
  276. )
  277. if block_label == "formula_number":
  278. result_str = result_str.replace("$", "")
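                    # Tables are predicted in OTSL form; convert to HTML when
                    # the conversion succeeds, then restore any figure
                    # placeholder tokens to their original image references.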
                    if block_label == "table":
                        html_str = convert_otsl_to_html(result_str)
                        if html_str != "":
                            result_str = html_str
                        result_str = untokenize_figure_of_table(
                            result_str, figure_token_map
                        )
                    block_content = result_str
                block_info = PaddleOCRVLBlock(
                    label=block_label,
                    bbox=block_bbox,
                    content=block_content,
                )
                if block_label in image_labels and block_img is not None:
                    x_min, y_min, x_max, y_max = list(map(int, block_bbox))
                    img_path = f"imgs/img_in_{block_label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
                    if img_path not in drop_figures_set:
                        block_img = cv2.cvtColor(block_img, cv2.COLOR_BGR2RGB)
                        block_info.image = {
                            "path": img_path,
                            "img": Image.fromarray(block_img),
                        }
                    else:
                        continue
                parsing_res_list.append(block_info)
            parsing_res_lists.append(parsing_res_list)
            table_res_lists.append(table_res_list)
        return parsing_res_lists, table_res_lists, imgs_in_doc

    def predict(
        self,
        input: Union[str, list[str], np.ndarray, list[np.ndarray]],
        use_doc_orientation_classify: Union[bool, None] = False,
        use_doc_unwarping: Union[bool, None] = False,
        use_layout_detection: Union[bool, None] = None,
        use_chart_recognition: Union[bool, None] = None,
        layout_threshold: Optional[Union[float, dict]] = None,
        layout_nms: Optional[bool] = None,
        layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
        layout_merge_bboxes_mode: Optional[str] = None,
        use_queues: Optional[bool] = None,
        prompt_label: Optional[str] = None,
        format_block_content: Union[bool, None] = None,
        repetition_penalty: Optional[float] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        max_new_tokens: Optional[int] = None,
        **kwargs,
    ) -> PaddleOCRVLResult:
  330. """
  331. Predicts the layout parsing result for the given input.
  332. Args:
  333. input (Union[str, list[str], np.ndarray, list[np.ndarray]]): Input image path, list of image paths,
  334. numpy array of an image, or list of numpy arrays.
  335. use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
  336. use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
  337. layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
  338. layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
  339. layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
  340. Defaults to None.
  341. If it's a single number, then both width and height are used.
  342. If it's a tuple of two numbers, then they are used separately for width and height respectively.
  343. If it's None, then no unclipping will be performed.
  344. layout_merge_bboxes_mode (Optional[str], optional): The mode for merging bounding boxes. Defaults to None.
  345. use_queues (Optional[bool], optional): Whether to use queues. Defaults to None.
  346. prompt_label (Optional[Union[str, None]], optional): The label of the prompt in ['ocr', 'formula', 'table', 'chart']. Defaults to None.
  347. format_block_content (Optional[bool]): Whether to format the block content. Default is None.
  348. repetition_penalty (Optional[float]): The repetition penalty parameter used for VL model sampling. Default is None.
  349. temperature (Optional[float]): Temperature parameter used for VL model sampling. Default is None.
  350. top_p (Optional[float]): Top-p parameter used for VL model sampling. Default is None.
  351. min_pixels (Optional[int]): The minimum number of pixels allowed when the VL model preprocesses images. Default is None.
  352. max_pixels (Optional[int]): The maximum number of pixels allowed when the VL model preprocesses images. Default is None.
  353. max_new_tokens (Optional[int]): The maximum number of new tokens. Default is None.
  354. **kwargs (Any): Additional settings to extend functionality.
  355. Returns:
  356. PaddleOCRVLResult: The predicted layout parsing result.
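
        Example:
            A minimal usage sketch (assumes ``config`` is a valid pipeline
            configuration dictionary loaded from the pipeline YAML)::

                pipeline = _PaddleOCRVLPipeline(config)
                for res in pipeline.predict("document.png"):
                    print(res["parsing_res_list"])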
  357. """
        model_settings = self.get_model_settings(
            use_doc_orientation_classify,
            use_doc_unwarping,
            use_layout_detection,
            use_chart_recognition,
            format_block_content,
        )

        if not self.check_model_settings_valid(model_settings):
            yield {"error": "the input params for model settings are invalid!"}
            return

        if use_queues is None:
            use_queues = self.use_queues

        if not model_settings["use_layout_detection"]:
            prompt_label = prompt_label if prompt_label else "ocr"
            assert prompt_label.lower() in [
                "ocr",
                "formula",
                "table",
                "chart",
            ], f"Layout detection is disabled (use_layout_detection=False). 'prompt_label' must be one of ['ocr', 'formula', 'table', 'chart'], but got '{prompt_label}'."
            if prompt_label.lower() == "chart":
                model_settings["use_chart_recognition"] = True
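
        # _process_cv runs the vision half of the pipeline (image reading, doc
        # preprocessing, layout detection) for one sampled batch, optionally
        # re-chunked to new_batch_size (e.g. the layout model's batch size).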
        def _process_cv(batch_data, new_batch_size=None):
            if not new_batch_size:
                new_batch_size = len(batch_data)
            for idx in range(0, len(batch_data), new_batch_size):
                instances = batch_data.instances[idx : idx + new_batch_size]
                input_paths = batch_data.input_paths[idx : idx + new_batch_size]
                page_indexes = batch_data.page_indexes[idx : idx + new_batch_size]
                image_arrays = self.img_reader(instances)

                if model_settings["use_doc_preprocessor"]:
                    doc_preprocessor_results = list(
                        self.doc_preprocessor_pipeline(
                            image_arrays,
                            use_doc_orientation_classify=use_doc_orientation_classify,
                            use_doc_unwarping=use_doc_unwarping,
                        )
                    )
                else:
                    doc_preprocessor_results = [
                        {"output_img": arr} for arr in image_arrays
                    ]
                doc_preprocessor_images = [
                    item["output_img"] for item in doc_preprocessor_results
                ]

                if model_settings["use_layout_detection"]:
                    layout_det_results = list(
                        self.layout_det_model(
                            doc_preprocessor_images,
                            threshold=layout_threshold,
                            layout_nms=layout_nms,
                            layout_unclip_ratio=layout_unclip_ratio,
                            layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                        )
                    )
                    imgs_in_doc = [
                        gather_imgs(doc_pp_img, layout_det_res["boxes"])
                        for doc_pp_img, layout_det_res in zip(
                            doc_preprocessor_images, layout_det_results
                        )
                    ]
                else:
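                    # Layout detection is disabled: treat the whole page as a
                    # single pseudo-box labeled with prompt_label so the VLM
                    # stage can run unchanged.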
                    layout_det_results = []
                    for doc_preprocessor_image in doc_preprocessor_images:
                        layout_det_results.append(
                            {
                                "input_path": None,
                                "page_index": None,
                                "boxes": [
                                    {
                                        "cls_id": 0,
                                        "label": prompt_label.lower(),
                                        "score": 1,
                                        "coordinate": [
                                            0,
                                            0,
                                            doc_preprocessor_image.shape[1],
                                            doc_preprocessor_image.shape[0],
                                        ],
                                    }
                                ],
                            }
                        )
                    imgs_in_doc = [[] for _ in layout_det_results]

                yield input_paths, page_indexes, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, imgs_in_doc

        def _process_vlm(results_cv):
            (
                input_paths,
                page_indexes,
                doc_preprocessor_images,
                doc_preprocessor_results,
                layout_det_results,
                imgs_in_doc,
            ) = results_cv
            parsing_res_lists, table_res_lists, imgs_in_doc = (
                self.get_layout_parsing_results(
                    doc_preprocessor_images,
                    layout_det_results,
                    imgs_in_doc,
                    model_settings["use_chart_recognition"],
                    {
                        "repetition_penalty": repetition_penalty,
                        "temperature": temperature,
                        "top_p": top_p,
                        "min_pixels": min_pixels,
                        "max_pixels": max_pixels,
                        "max_new_tokens": max_new_tokens,
                    },
                )
            )
            for (
                input_path,
                page_index,
                doc_preprocessor_image,
                doc_preprocessor_res,
                layout_det_res,
                table_res_list,
                parsing_res_list,
                imgs_in_doc_for_img,
            ) in zip(
                input_paths,
                page_indexes,
                doc_preprocessor_images,
                doc_preprocessor_results,
                layout_det_results,
                table_res_lists,
                parsing_res_lists,
                imgs_in_doc,
            ):
                single_img_res = {
                    "input_path": input_path,
                    "page_index": page_index,
                    "doc_preprocessor_res": doc_preprocessor_res,
                    "layout_det_res": layout_det_res,
                    "table_res_list": table_res_list,
                    "parsing_res_list": parsing_res_list,
                    "imgs_in_doc": imgs_in_doc_for_img,
                    "model_settings": model_settings,
                }
                yield PaddleOCRVLResult(single_img_res)
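
        # Queue mode overlaps the three stages (input loading, CV, VLM) in
        # separate threads connected by bounded queues. Each queue item is a
        # tuple whose first element flags success; worker exceptions travel as
        # (False, stage_name, exception) and are re-raised in the consumer
        # loop below.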
        if use_queues:
            max_num_batches_in_process = 64
            queue_input = queue.Queue(maxsize=max_num_batches_in_process)
            queue_cv = queue.Queue(maxsize=max_num_batches_in_process)
            queue_vlm = queue.Queue(
                maxsize=self.batch_sampler.batch_size * max_num_batches_in_process
            )
            event_shutdown = threading.Event()
            event_data_loading_done = threading.Event()
            event_cv_processing_done = threading.Event()
            event_vlm_processing_done = threading.Event()

            def _worker_input(input_):
                all_batch_data = self.batch_sampler(input_)
                while not event_shutdown.is_set():
                    try:
                        batch_data = next(all_batch_data)
                    except StopIteration:
                        break
                    except Exception as e:
                        queue_input.put((False, "input", e))
                        break
                    else:
                        queue_input.put((True, batch_data))
                event_data_loading_done.set()

            def _worker_cv():
                while not event_shutdown.is_set():
                    try:
                        item = queue_input.get(timeout=0.5)
                    except queue.Empty:
                        if event_data_loading_done.is_set():
                            event_cv_processing_done.set()
                            break
                        continue
                    if not item[0]:
                        queue_cv.put(item)
                        break
                    try:
                        for results_cv in _process_cv(
                            item[1],
                            (
                                self.layout_det_model.batch_sampler.batch_size
                                if model_settings["use_layout_detection"]
                                else None
                            ),
                        ):
                            queue_cv.put((True, results_cv))
                    except Exception as e:
                        queue_cv.put((False, "cv", e))
                        break

            def _worker_vlm():
                MAX_QUEUE_DELAY_SECS = 0.5
                MAX_NUM_BOXES = self.vl_rec_model.batch_sampler.batch_size
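                # Micro-batch CV results: keep pulling from queue_cv until
                # either MAX_QUEUE_DELAY_SECS elapses or the accumulated box
                # count reaches the VL model's batch size, then run VLM
                # inference on the merged batch.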
                while not event_shutdown.is_set():
                    results_cv_list = []
                    start_time = time.time()
                    should_break = False
                    num_boxes = 0
                    while True:
                        remaining_time = MAX_QUEUE_DELAY_SECS - (
                            time.time() - start_time
                        )
                        if remaining_time <= 0:
                            break
                        try:
                            item = queue_cv.get(timeout=remaining_time)
                        except queue.Empty:
                            break
                        if not item[0]:
                            queue_vlm.put(item)
                            should_break = True
                            break
                        results_cv_list.append(item[1])
                        for res in results_cv_list[-1][4]:
                            num_boxes += len(res["boxes"])
                        if num_boxes >= MAX_NUM_BOXES:
                            break
                    if should_break:
                        break
                    if not results_cv_list:
                        if event_cv_processing_done.is_set():
                            event_vlm_processing_done.set()
                            break
                        continue
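                    # Each results_cv is a 6-tuple of parallel lists; transpose
                    # the collected tuples and flatten each field so the merged
                    # batch looks like a single _process_cv output.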
                    merged_results_cv = [
                        list(chain.from_iterable(lists))
                        for lists in zip(*results_cv_list)
                    ]
                    try:
                        for result_vlm in _process_vlm(merged_results_cv):
                            queue_vlm.put((True, result_vlm))
                    except Exception as e:
                        queue_vlm.put((False, "vlm", e))
                        break

            thread_input = threading.Thread(
                target=_worker_input, args=(input,), daemon=False
            )
            thread_input.start()
            thread_cv = threading.Thread(target=_worker_cv, daemon=False)
            thread_cv.start()
            thread_vlm = threading.Thread(target=_worker_vlm, daemon=False)
            thread_vlm.start()

        try:
            if use_queues:
                while not (event_vlm_processing_done.is_set() and queue_vlm.empty()):
                    try:
                        item = queue_vlm.get(timeout=0.5)
                    except queue.Empty:
                        if event_vlm_processing_done.is_set():
                            break
                        continue
                    if not item[0]:
                        raise RuntimeError(
                            f"Exception from the '{item[1]}' worker: {item[2]}"
                        )
                    else:
                        yield item[1]
            else:
                for batch_data in self.batch_sampler(input):
                    results_cv_list = list(_process_cv(batch_data))
                    assert len(results_cv_list) == 1, len(results_cv_list)
                    results_cv = results_cv_list[0]
                    for res in _process_vlm(results_cv):
                        yield res
        finally:
            if use_queues:
                event_shutdown.set()
                thread_cv.join(timeout=5)
                if thread_cv.is_alive():
                    logging.warning("CV worker did not terminate in time")
                thread_vlm.join(timeout=5)
                if thread_vlm.is_alive():
                    logging.warning("VLM worker did not terminate in time")

    def concatenate_markdown_pages(self, markdown_list: list) -> str:
        """
        Concatenate Markdown content from multiple pages into a single document.

        Args:
            markdown_list (list): A list containing Markdown data for each page.

        Returns:
            str: The concatenated Markdown text.
        """
        markdown_texts = ""
        for res in markdown_list:
            markdown_texts += "\n\n" + res["markdown_texts"]
        return markdown_texts


@pipeline_requires_extra("ocr")
class PaddleOCRVLPipeline(AutoParallelImageSimpleInferencePipeline):
    entities = "PaddleOCR-VL"

    @property
    def _pipeline_cls(self):
        return _PaddleOCRVLPipeline

    def _get_batch_size(self, config):
        return config.get("batch_size", 1)