Ver código fonte

加入对html的后处理

zhch158_admin 11 meses atrás
pai
commit
bf42036dcd
1 arquivo alterado com 10 adições e 3 exclusões
  1. zhch/magic_pdf_parse_main_zhch.py (+10 -3)

+ 10 - 3
zhch/magic_pdf_parse_main_zhch.py

@@ -10,6 +10,9 @@ from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
 from magic_pdf.pipe.UNIPipe import UNIPipe
 
+import pandas as pd
+from zhch.html_zhch import read_html_zhch
+
 # todo: 设备类型选择 (?)
 from dotenv import load_dotenv; load_dotenv()
 print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}") 
@@ -51,7 +54,7 @@ def json_md_dump(
 
 # 使用Pydantic定义report数据结构
 from pydantic import BaseModel
-import pandas as pd
+
 import re
 from magic_pdf.config.ocr_content_type import BlockType
 class Report(BaseModel):
@@ -105,7 +108,11 @@ def save_report(
                     # 遍历每个block,找到table_body和table_caption
                     if block['type'] == BlockType.TableBody:
                         # 将html转换为dataframe
-                        dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
+                        # dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
+                        dataframe = read_html_zhch(block['lines'][0]['spans'][0]['html'], custom_args={
+                            "colspan_single": ["header", "body"],
+                            "number_strip": True
+                        })[0]
                     elif block['type'] == BlockType.TableCaption:
                         title = block['lines'][0]['spans'][0]['content']
                         # 如果title不为空,且title的最后一个字符是“表” 或者 结尾lowcase是“table”
@@ -257,7 +264,7 @@ def pdf_parse_main(
 if __name__ == '__main__':
     current_script_dir = os.path.dirname(os.path.abspath(__file__))
     # demo_names = ['demo1', 'demo2', 'small_ocr']
-    demo_names = ['600916_中国黄金_2002年报_88_90']
+    demo_names = ['600916_中国黄金_2002年报_83_94']
     for name in demo_names:
         file_path = os.path.join(current_script_dir, f'{name}.pdf')
         pdf_parse_main(file_path, output_dir='./output.demo')