|
|
@@ -10,6 +10,9 @@ from magic_pdf.pipe.OCRPipe import OCRPipe
|
|
|
from magic_pdf.pipe.TXTPipe import TXTPipe
|
|
|
from magic_pdf.pipe.UNIPipe import UNIPipe
|
|
|
|
|
|
+import pandas as pd
|
|
|
+from zhch.html_zhch import read_html_zhch
|
|
|
+
|
|
|
# todo: 设备类型选择 (?)
|
|
|
from dotenv import load_dotenv; load_dotenv()
|
|
|
print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}")
|
|
|
@@ -51,7 +54,7 @@ def json_md_dump(
|
|
|
|
|
|
# 使用Pydantic定义report数据结构
|
|
|
from pydantic import BaseModel
|
|
|
-import pandas as pd
|
|
|
+
|
|
|
import re
|
|
|
from magic_pdf.config.ocr_content_type import BlockType
|
|
|
class Report(BaseModel):
|
|
|
@@ -105,7 +108,11 @@ def save_report(
|
|
|
# 遍历每个block,找到table_body和table_caption
|
|
|
if block['type'] == BlockType.TableBody:
|
|
|
# 将html转换为dataframe
|
|
|
- dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
|
|
|
+ # dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
|
|
|
+ dataframe = read_html_zhch(block['lines'][0]['spans'][0]['html'], custom_args={
|
|
|
+ "colspan_single": ["header", "body"],
|
|
|
+ "number_strip": True
|
|
|
+ })[0]
|
|
|
elif block['type'] == BlockType.TableCaption:
|
|
|
title = block['lines'][0]['spans'][0]['content']
|
|
|
# 如果title不为空,且title的最后一个字符是“表” 或者 结尾lowcase是“table”
|
|
|
@@ -257,7 +264,7 @@ def pdf_parse_main(
|
|
|
if __name__ == '__main__':
|
|
|
current_script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
# demo_names = ['demo1', 'demo2', 'small_ocr']
|
|
|
- demo_names = ['600916_中国黄金_2002年报_88_90']
|
|
|
+ demo_names = ['600916_中国黄金_2002年报_83_94']
|
|
|
for name in demo_names:
|
|
|
file_path = os.path.join(current_script_dir, f'{name}.pdf')
|
|
|
pdf_parse_main(file_path, output_dir='./output.demo')
|