Browse Source

fix equation replace type

赵小蒙 1 year ago
parent
commit
7dcf1b7cc5
2 changed files with 32 additions and 18 deletions
  1. 26 12
      magic_pdf/pdf_parse_by_txt_v2.py
  2. 6 6
      magic_pdf/pre_proc/equations_replace.py

+ 26 - 12
magic_pdf/pdf_parse_by_txt_v2.py

@@ -21,11 +21,7 @@ from magic_pdf.pre_proc.ocr_span_list_modify import (
     remove_overlaps_min_spans,
     get_qa_need_list_v2,
 )
-from magic_pdf.pre_proc.equations_replace import (
-    combine_chars_to_pymudict,
-    remove_chars_in_text_blocks,
-    replace_equations_in_textblock,
-)
+
 from magic_pdf.pre_proc.equations_replace import (
     combine_chars_to_pymudict,
     remove_chars_in_text_blocks,
@@ -55,16 +51,34 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
                 bbox = span["bbox"]
                 if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
                     continue
-                spans.append(
-                    {
-                        "bbox": list(span["bbox"]),
-                        "content": span["text"],
-                        "type": ContentType.Text,
-                    }
-                )
+                if span.get('type') == ContentType.InlineEquation:
+                    spans.append(
+                        {
+                            "bbox": list(span["bbox"]),
+                            "content": span["latex"],
+                            "type": ContentType.InlineEquation,
+                        }
+                    )
+                elif span.get('type') == ContentType.InterlineEquation:
+                    spans.append(
+                        {
+                            "bbox": list(span["bbox"]),
+                            "content": span["latex"],
+                            "type": ContentType.InterlineEquation,
+                        }
+                    )
+                else:
+                    spans.append(
+                        {
+                            "bbox": list(span["bbox"]),
+                            "content": span["text"],
+                            "type": ContentType.Text,
+                        }
+                    )
     return spans
 
 
+
 def replace_text_span(pymu_spans, ocr_spans):
     return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
 

+ 6 - 6
magic_pdf/pre_proc/equations_replace.py

@@ -191,13 +191,13 @@ def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
                     "spans": [
                         {
                             "size": 9.962599754333496,
-                            "_type": TYPE_INTERLINE_EQUATION,
+                            "type": TYPE_INTERLINE_EQUATION,
                             "flags": 4,
                             "font": TYPE_INTERLINE_EQUATION,
                             "color": 0,
                             "ascender": 0.9409999847412109,
                             "descender": -0.3050000071525574,
-                            "text": f"\n$$\n{latex_content}\n$$\n",
+                            "latex": latex_content,
                             "origin": [bbox[0], bbox[1]],
                             "bbox": bbox,
                         }
@@ -309,13 +309,13 @@ def replace_line_v2(eqinfo, line):
 
     equation_span = {
         "size": 9.962599754333496,
-        "_type": TYPE_INLINE_EQUATION,
+        "type": TYPE_INLINE_EQUATION,
         "flags": 4,
         "font": TYPE_INLINE_EQUATION,
         "color": 0,
         "ascender": 0.9409999847412109,
         "descender": -0.3050000071525574,
-        "text": "",
+        "latex": "",
         "origin": [337.1410153102337, 216.0205245153934],
         "bbox": [
             337.1410153102337,
@@ -325,11 +325,11 @@ def replace_line_v2(eqinfo, line):
         ],
     }
     # equation_span = line['spans'][0].copy()
-    equation_span["text"] = f" ${eqinfo['latex']}$ "
+    equation_span["latex"] = eqinfo['latex']
     equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]]
     equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]]
     equation_span["chars"] = delete_chars
-    equation_span["_type"] = TYPE_INLINE_EQUATION
+    equation_span["type"] = TYPE_INLINE_EQUATION
     equation_span["_eq_bbox"] = eqinfo["bbox"]
     line["spans"].insert(first_overlap_span_idx + 1, equation_span)  # 放入公式