# layout_parsing.py
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import os
  15. from typing import Final, List, Literal, Optional, Tuple
  16. import cv2
  17. import numpy as np
  18. from fastapi import FastAPI, HTTPException
  19. from numpy.typing import ArrayLike
  20. from pydantic import BaseModel, Field
  21. from typing_extensions import Annotated, TypeAlias
  22. from .....utils import logging
  23. from ...layout_parsing import LayoutParsingPipeline
  24. from .. import file_storage
  25. from .. import utils as serving_utils
  26. from ..app import AppConfig, create_app
  27. from ..models import Response, ResultResponse
  28. _DEFAULT_MAX_IMG_SIZE: Final[Tuple[int, int]] = (2000, 2000)
  29. _DEFAULT_MAX_NUM_IMGS: Final[int] = 10
  30. FileType: TypeAlias = Literal[0, 1]
  31. class InferenceParams(BaseModel):
  32. maxLongSide: Optional[Annotated[int, Field(gt=0)]] = None
  33. class InferRequest(BaseModel):
  34. file: str
  35. fileType: Optional[FileType] = None
  36. useImgOrientationCls: bool = True
  37. useImgUnwrapping: bool = True
  38. useSealTextDet: bool = True
  39. inferenceParams: Optional[InferenceParams] = None
  40. BoundingBox: TypeAlias = Annotated[List[float], Field(min_length=4, max_length=4)]
  41. class LayoutElement(BaseModel):
  42. bbox: BoundingBox
  43. label: str
  44. text: str
  45. layoutType: Literal["single", "double"]
  46. image: Optional[str] = None
  47. class LayoutParsingResult(BaseModel):
  48. layoutElements: List[LayoutElement]
  49. class InferResult(BaseModel):
  50. layoutParsingResults: List[LayoutParsingResult]
  51. def _postprocess_image(
  52. img: ArrayLike,
  53. request_id: str,
  54. filename: str,
  55. file_storage_config: file_storage.FileStorageConfig,
  56. ) -> str:
  57. key = f"{request_id}/{filename}"
  58. ext = os.path.splitext(filename)[1]
  59. img = np.asarray(img)
  60. _, encoded_img = cv2.imencode(ext, img)
  61. encoded_img = encoded_img.tobytes()
  62. return file_storage.postprocess_file(
  63. encoded_img, config=file_storage_config, key=key
  64. )
  65. def create_pipeline_app(
  66. pipeline: LayoutParsingPipeline, app_config: AppConfig
  67. ) -> FastAPI:
  68. app, ctx = create_app(
  69. pipeline=pipeline, app_config=app_config, app_aiohttp_session=True
  70. )
  71. if "file_storage_config" in ctx.extra:
  72. ctx.extra["file_storage_config"] = file_storage.parse_file_storage_config(
  73. ctx.extra["file_storage_config"]
  74. )
  75. else:
  76. ctx.extra["file_storage_config"] = file_storage.InMemoryStorageConfig()
  77. ctx.extra.setdefault("max_img_size", _DEFAULT_MAX_IMG_SIZE)
  78. ctx.extra.setdefault("max_num_imgs", _DEFAULT_MAX_NUM_IMGS)
  79. @app.post(
  80. "/layout-parsing",
  81. operation_id="infer",
  82. responses={422: {"model": Response}},
  83. response_model_exclude_none=True,
  84. )
  85. async def _infer(
  86. request: InferRequest,
  87. ) -> ResultResponse[InferResult]:
  88. pipeline = ctx.pipeline
  89. aiohttp_session = ctx.aiohttp_session
  90. request_id = serving_utils.generate_request_id()
  91. if request.fileType is None:
  92. if serving_utils.is_url(request.file):
  93. try:
  94. file_type = serving_utils.infer_file_type(request.file)
  95. except Exception as e:
  96. logging.exception(e)
  97. raise HTTPException(
  98. status_code=422,
  99. detail="The file type cannot be inferred from the URL. Please specify the file type explicitly.",
  100. )
  101. else:
  102. raise HTTPException(status_code=422, detail="Unknown file type")
  103. else:
  104. file_type = "PDF" if request.fileType == 0 else "IMAGE"
  105. if request.inferenceParams:
  106. max_long_side = request.inferenceParams.maxLongSide
  107. if max_long_side:
  108. raise HTTPException(
  109. status_code=422,
  110. detail="`max_long_side` is currently not supported.",
  111. )
  112. try:
  113. file_bytes = await serving_utils.get_raw_bytes(
  114. request.file, aiohttp_session
  115. )
  116. images = await serving_utils.call_async(
  117. serving_utils.file_to_images,
  118. file_bytes,
  119. file_type,
  120. max_img_size=ctx.extra["max_img_size"],
  121. max_num_imgs=ctx.extra["max_num_imgs"],
  122. )
  123. result = await pipeline.infer(
  124. images,
  125. use_doc_image_ori_cls_model=request.useImgOrientationCls,
  126. use_doc_image_unwarp_model=request.useImgUnwrapping,
  127. use_seal_text_det_model=request.useSealTextDet,
  128. )
  129. layout_parsing_results: List[LayoutParsingResult] = []
  130. for i, item in enumerate(result):
  131. layout_elements: List[LayoutElement] = []
  132. for j, subitem in enumerate(
  133. item["layout_parsing_result"]["parsing_result"]
  134. ):
  135. dyn_keys = subitem.keys() - {"input_path", "layout_bbox", "layout"}
  136. if len(dyn_keys) != 1:
  137. raise RuntimeError(f"Unexpected result: {subitem}")
  138. label = next(iter(dyn_keys))
  139. if label in ("image", "figure", "img", "fig"):
  140. image_ = await serving_utils.call_async(
  141. _postprocess_image,
  142. subitem[label]["img"],
  143. request_id=request_id,
  144. filename=f"image_{i}_{j}.jpg",
  145. file_storage_config=ctx.extra["file_storage_config"],
  146. )
  147. text = subitem[label]["image_text"]
  148. else:
  149. image_ = None
  150. text = subitem[label]
  151. layout_elements.append(
  152. LayoutElement(
  153. bbox=subitem["layout_bbox"],
  154. label=label,
  155. text=text,
  156. layoutType=subitem["layout"],
  157. image=image_,
  158. )
  159. )
  160. layout_parsing_results.append(
  161. LayoutParsingResult(layoutElements=layout_elements)
  162. )
  163. return ResultResponse(
  164. logId=serving_utils.generate_log_id(),
  165. errorCode=0,
  166. errorMsg="Success",
  167. result=InferResult(
  168. layoutParsingResults=layout_parsing_results,
  169. ),
  170. )
  171. except Exception as e:
  172. logging.exception(e)
  173. raise HTTPException(status_code=500, detail="Internal server error")
  174. return app