Bläddra i källkod

# add table recognition using struct-eqtable
## Changelog
31/07/2024
- Support table recognition. Table images will be converted into LaTeX.

### how to use the new feature:
Set the attribute 'table-mode' to 'true' in magic-pdf.json.

### caution:
Converting a single table image takes 200 s to 500 s when running on CPU.

liukaiwen 1 år sedan
förälder
incheckning
d6c58ecca2

+ 1 - 1
magic_pdf/dict2md/ocr_mkcontent.py

@@ -130,7 +130,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                                 if span['type'] == ContentType.Table:
                                     # if processed by table model
                                     if span.get('content', ''):
-                                        para_text += f"\n {span['content']}  \n"
+                                        para_text += f"\n\n$\n {span['content']}\n$\n\n"
                                     else:
                                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote

+ 1 - 5
magic_pdf/model/magic_model.py

@@ -561,13 +561,9 @@ class MagicModel:
                     span["type"] = ContentType.Image
                 elif category_id == 5:
                     # 获取table模型结果
-                    html = layout_det.get("html", None)
                     latex = layout_det.get("latex", None)
-                    if html:
-                        span["content"] = html
-                    elif latex:
+                    if latex:
                         span["content"] = latex
-
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]

+ 1 - 7
magic_pdf/model/pdf_extract_kit.py

@@ -287,13 +287,7 @@ class CustomPEKModel:
                     end_time = time.time()
                     run_time = end_time - start_time
                     print(f"------------table recognition processing ends within {run_time}s-----")
+                    layout["latex"] = latex_code
 
-                    # try to convert latex to html
-                    try:
-                        html_code = convert_text(latex_code, 'html', format='latex')
-                        layout["html"] = html_code
-                    except Exception as e:
-                        layout["latex"] = latex_code
-                        logger.error(f"[pdf_extract_kit][CustomPEKModel]: converting latex to html failed: {e}")
 
         return layout_res