ソースを参照

update paddleocr to 2.8+ and add layout score output

赵小蒙 1 年前
コミット
a5ff8acea7
2 ファイル変更、5 行追加、2 行削除
  1. +4 -1
      magic_pdf/model/doc_analyze_by_pp_structurev2.py
  2. +1 -1
      requirements.txt

+ 4 - 1
magic_pdf/model/doc_analyze_by_pp_structurev2.py

@@ -90,7 +90,10 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
                 line['category_id'] = 2
             else:
                 logger.warning(f"unknown type: {line['type']}")
-            line['score'] = 0.5 + random.random() * 0.5
+
+            # 兼容不输出score的paddleocr版本
+            if line.get("score") is None:
+                line['score'] = 0.5 + random.random() * 0.5
 
             res = line.pop('res', None)
             if res is not None and len(res) > 0:

+ 1 - 1
requirements.txt

@@ -16,4 +16,4 @@ nltk==3.8.1
 s3pathlib>=2.1.1
 pytest
 paddlepaddle
-paddleocr>=2.6.0.3
+paddleocr @ https://github.com/myhloli/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl