11 meses atrás · bf42036dcd
--- a/zhch/magic_pdf_parse_main_zhch.py
+++ b/zhch/magic_pdf_parse_main_zhch.py
@@ -10,6 +10,9 @@ from magic_pdf.pipe.OCRPipe import OCRPipe
 
				 from magic_pdf.pipe.TXTPipe import TXTPipe
			
 
				 from magic_pdf.pipe.UNIPipe import UNIPipe
			
 
				 
			
 
				+import pandas as pd
			
 
				+from zhch.html_zhch import read_html_zhch
			
 
				+
			
 
				 # todo: 设备类型选择 （？）
			
 
				 from dotenv import load_dotenv; load_dotenv()
			
 
				 print(f"os.environ['CUDA_VISIBLE_DEVICES']: {os.environ['CUDA_VISIBLE_DEVICES']}") 
			
@@ -51,7 +54,7 @@ def json_md_dump(
 
				 
			
 
				 # 使用Pydantic定义report数据结构
			
 
				 from pydantic import BaseModel
			
 
				-import pandas as pd
			
 
				+
			
 
				 import re
			
 
				 from magic_pdf.config.ocr_content_type import BlockType
			
 
				 class Report(BaseModel):
			
@@ -105,7 +108,11 @@ def save_report(
 
				                     # 遍历每个block，找到table_body和table_caption
			
 
				                     if block['type'] == BlockType.TableBody:
			
 
				                         # 将html转换为dataframe
			
 
				-                        dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
			
 
				+                        # dataframe = pd.read_html(block['lines'][0]['spans'][0]['html'])[0]
			
 
				+                        dataframe = read_html_zhch(block['lines'][0]['spans'][0]['html'], custom_args={
			
 
				+                            "colspan_single": ["header", "body"],
			
 
				+                            "number_strip": True
			
 
				+                        })[0]
			
 
				                     elif block['type'] == BlockType.TableCaption:
			
 
				                         title = block['lines'][0]['spans'][0]['content']
			
 
				                         # 如果title不为空，且title的最后一个字符是“表” 或者 结尾lowcase是“table”
			
@@ -257,7 +264,7 @@ def pdf_parse_main(
 
				 if __name__ == '__main__':
			
 
				     current_script_dir = os.path.dirname(os.path.abspath(__file__))
			
 
				     # demo_names = ['demo1', 'demo2', 'small_ocr']
			
 
				-    demo_names = ['600916_中国黄金_2002年报_88_90']
			
 
				+    demo_names = ['600916_中国黄金_2002年报_83_94']
			
 
				     for name in demo_names:
			
 
				         file_path = os.path.join(current_script_dir, f'{name}.pdf')
			
 
				         pdf_parse_main(file_path, output_dir='./output.demo')