1 year ago · d04f3f22f5
--- a/README_zh-CN_v2.md
+++ b/README_zh-CN_v2.md
@@ -91,6 +91,7 @@ MinerU诞生于[书生-浦语](https://github.com/InternLM/InternLM)的预训练
 
				 - 保留原文档的结构，包括标题、段落、列表等
			
 
				 - 提取图像、图片标题、表格、表格标题
			
 
				 - 自动识别文档中的公式并将公式转换成LaTeX
			
 
				+- 自动识别文档中的表格并将表格转换成LaTeX
			
 
				 - 乱码PDF自动检测并启用OCR
			
 
				 - 支持CPU和GPU环境
			
 
				 - 支持Windows/Linux/macOS平台
			
@@ -235,7 +236,7 @@ TODO
 
				 - [ ] 正文中列表识别
			
 
				 - [ ] 正文中代码块识别
			
 
				 - [ ] 目录识别
			
 
				-- [ ] 表格识别
			
 
				+- [x] 表格识别
			
 
				 - [ ] 化学式识别
			
 
				 - [ ] 几何图形识别
			
 
				 
			
@@ -270,6 +271,7 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
 
				 - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
			
 
				 - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
			
 
				 - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
			
 
				+- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
			
 
				 
			
 
				 # Citation
			
 
				 
			
--- a/docs/how_to_download_models_zh_cn.md
+++ b/docs/how_to_download_models_zh_cn.md
@@ -73,5 +73,15 @@ git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git
 
				 │       ├── README.md
			
 
				 │       ├── tokenizer_config.json
			
 
				 │       └── tokenizer.json
			
 
				+├── TabRec
			
 
				+│   └── StructEqTable
			
 
				+│       ├── config.json
			
 
				+│       ├── generation_config.json
			
 
				+│       ├── model.safetensors
			
 
				+│       ├── preprocessor_config.json
			
 
				+│       ├── special_tokens_map.json
			
 
				+│       ├── spiece.model
			
 
				+│       ├── tokenizer.json
			
 
				+│       └── tokenizer_config.json 
			
 
				 └── README.md
			
 
				 ```
			
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -253,9 +253,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
 
				         }
			
 
				         for block in para_block['blocks']:
			
 
				             if block['type'] == BlockType.TableBody:
			
 
				-                #TODO
			
 
				                 if block["lines"][0]["spans"][0].get('content', ''):
			
 
				-                    para_content['table_body'] = f"\n {block['lines'][0]['spans'][0]['content']}  \n"
			
 
				+                    para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['content']}\n$\n\n"
			
 
				                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
			
 
				             if block['type'] == BlockType.TableCaption:
			
 
				                 para_content['table_caption'] = merge_para_with_text(block)
			
--- a/magic_pdf/resources/model_config/model_configs.yaml
+++ b/magic_pdf/resources/model_config/model_configs.yaml
@@ -8,4 +8,4 @@ weights:
 
				   layout: Layout/model_final.pth
			
 
				   mfd: MFD/weights.pt
			
 
				   mfr: MFR/UniMERNet
			
 
				-  table: Table/
			
 
				+  table: TabRec/StructEqTable
			
--- a/requirements-qa.txt
+++ b/requirements-qa.txt
@@ -13,4 +13,5 @@ scikit-learn
 
				 tqdm
			
 
				 htmltabletomd
			
 
				 pypandoc
			
 
				-pyopenssl==24.0.0
			
 
				+pyopenssl==24.0.0
			
 
				+struct-eqtable==0.1.0
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,6 @@ fast-langdetect==0.2.0
 
				 wordninja>=2.0.0
			
 
				 scikit-learn>=1.0.2
			
 
				 pdfminer.six==20231228
			
 
				+pypandoc
			
 
				+struct-eqtable==0.1.0
			
 
				 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.