Преглед на файлове

Explicitly close pdfium doc obj after use (#4362)

Lin Manhui преди 4 месеца
родител
ревизия
804c106c47
променени са 3 файла, в които са добавени 51 реда и са изтрити 41 реда
  1. 22 19
      paddlex/inference/models/formula_recognition/result.py
  2. 19 16
      paddlex/inference/serving/infra/utils.py
  3. 10 6
      paddlex/inference/utils/io/readers.py

+ 22 - 19
paddlex/inference/models/formula_recognition/result.py

@@ -256,25 +256,28 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
         np.ndarray: The resulting image as a NumPy array, or None if the PDF is not single-page.
     """
     pdfDoc = pdfium.PdfDocument(pdf_path)
-    if len(pdfDoc) != 1:
-        return None
-    for page in pdfDoc:
-        rotate = int(0)
-        zoom = 2
-        img = page.render(scale=zoom, rotation=rotate).to_pil()
-        img = img.convert("RGB")
-        img = np.array(img)
-        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-        xywh = crop_white_area(img)
-
-        if xywh is not None:
-            x, y, w, h = xywh
-            img = img[y : y + h, x : x + w]
-            if is_padding:
-                img = cv2.copyMakeBorder(
-                    img, 30, 30, 30, 30, cv2.BORDER_CONSTANT, value=(255, 255, 255)
-                )
-            return img
+    try:
+        if len(pdfDoc) != 1:
+            return None
+        for page in pdfDoc:
+            rotate = int(0)
+            zoom = 2
+            img = page.render(scale=zoom, rotation=rotate).to_pil()
+            img = img.convert("RGB")
+            img = np.array(img)
+            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+            xywh = crop_white_area(img)
+
+            if xywh is not None:
+                x, y, w, h = xywh
+                img = img[y : y + h, x : x + w]
+                if is_padding:
+                    img = cv2.copyMakeBorder(
+                        img, 30, 30, 30, 30, cv2.BORDER_CONSTANT, value=(255, 255, 255)
+                    )
+                return img
+    finally:
+        pdfDoc.close()
     return None
 
 

+ 19 - 16
paddlex/inference/serving/infra/utils.py

@@ -188,22 +188,25 @@ def read_pdf(
     page_info_list: List[PDFPageInfo] = []
     with _lock:
         doc = pdfium.PdfDocument(bytes_)
-        for page in doc:
-            if max_num_imgs is not None and len(images) >= max_num_imgs:
-                break
-            # TODO: Do not always use zoom=2.0
-            zoom = 2.0
-            deg = 0
-            image = page.render(scale=zoom, rotation=deg).to_pil()
-            image = image.convert("RGB")
-            image = np.array(image)
-            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
-            images.append(image)
-            page_info = PDFPageInfo(
-                width=image.shape[1],
-                height=image.shape[0],
-            )
-            page_info_list.append(page_info)
+        try:
+            for page in doc:
+                if max_num_imgs is not None and len(images) >= max_num_imgs:
+                    break
+                # TODO: Do not always use zoom=2.0
+                zoom = 2.0
+                deg = 0
+                image = page.render(scale=zoom, rotation=deg).to_pil()
+                image = image.convert("RGB")
+                image = np.array(image)
+                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+                images.append(image)
+                page_info = PDFPageInfo(
+                    width=image.shape[1],
+                    height=image.shape[0],
+                )
+                page_info_list.append(page_info)
+        finally:
+            doc.close()
     pdf_info = PDFInfo(
         numPages=len(page_info_list),
         pages=page_info_list,

+ 10 - 6
paddlex/inference/utils/io/readers.py

@@ -290,12 +290,16 @@ class PDFReaderBackend(_BaseReaderBackend):
         self._scale = zoom
 
     def read_file(self, in_path):
-        for page in pdfium.PdfDocument(in_path):
-            image = page.render(scale=self._scale, rotation=self._rotation).to_pil()
-            image = image.convert("RGB")
-            img_cv = np.array(image)
-            img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
-            yield img_cv
+        doc = pdfium.PdfDocument(in_path)
+        try:
+            for page in doc:
+                image = page.render(scale=self._scale, rotation=self._rotation).to_pil()
+                image = image.convert("RGB")
+                img_cv = np.array(image)
+                img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
+                yield img_cv
+        finally:
+            doc.close()
 
 
 class TXTReaderBackend(_BaseReaderBackend):