Эх сурвалжийг харах

feat(model): add HTML minification to StructTableModel

- Import 're' module for regular expression operations
- Implement HTML minification for 'output_format=html'
- Add 'minify_html' method to remove unnecessary whitespace and format HTML
myhloli 1 жил өмнө
parent
commit
b5117e72d3

+ 14 - 0
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py

@@ -1,3 +1,5 @@
+import re
+
 import torch
 from struct_eqtable import build_model
 
@@ -28,4 +30,16 @@ class StructTableModel:
             images, output_format=output_format
         )
 
+        if output_format == "html":
+            results = [self.minify_html(html) for html in results]
+
         return results
+
+    def minify_html(self, html):
+        # Collapse every run of whitespace (spaces, tabs, newlines) into a single space
+        html = re.sub(r'\s+', ' ', html)
+        # Drop any whitespace on either side of '>' (not just trailing line whitespace)
+        html = re.sub(r'\s*>\s*', '>', html)
+        # Drop any whitespace on either side of '<'; this also removes spaces between text and an adjacent tag
+        html = re.sub(r'\s*<\s*', '<', html)
+        return html.strip()