3 tháng trước cách đây · 27e2ea44b1
--- a/mineru/model/table/rec/slanet_plus/matcher_utils.py
+++ b/mineru/model/table/rec/slanet_plus/matcher_utils.py
@@ -28,20 +28,20 @@ def deal_isolate_span(thead_part):
 
				     """
			
 
				     # 1. find out isolate span tokens.
			
 
				     isolate_pattern = (
			
 
				-        '<td></td> rowspan="(\d)+" colspan="(\d)+"></b></td>|'
			
 
				-        '<td></td> colspan="(\d)+" rowspan="(\d)+"></b></td>|'
			
 
				-        '<td></td> rowspan="(\d)+"></b></td>|'
			
 
				-        '<td></td> colspan="(\d)+"></b></td>'
			
 
				+        r"<td></td> rowspan='(\d)+' colspan='(\d)+'></b></td>|"
			
 
				+        r"<td></td> colspan='(\d)+' rowspan='(\d)+'></b></td>|"
			
 
				+        r"<td></td> rowspan='(\d)+'></b></td>|"
			
 
				+        r"<td></td> colspan='(\d)+'></b></td>"
			
 
				     )
			
 
				     isolate_iter = re.finditer(isolate_pattern, thead_part)
			
 
				     isolate_list = [i.group() for i in isolate_iter]
			
 
				 
			
 
				-    # 2. find out span number, by step 1 results.
			
 
				+    # 2. find out span number, by step 1 result.
			
 
				     span_pattern = (
			
 
				-        ' rowspan="(\d)+" colspan="(\d)+"|'
			
 
				-        ' colspan="(\d)+" rowspan="(\d)+"|'
			
 
				-        ' rowspan="(\d)+"|'
			
 
				-        ' colspan="(\d)+"'
			
 
				+        r" rowspan='(\d)+' colspan='(\d)+'|"
			
 
				+        r" colspan='(\d)+' rowspan='(\d)+'|"
			
 
				+        r" rowspan='(\d)+'|"
			
 
				+        r" colspan='(\d)+'"
			
 
				     )
			
 
				     corrected_list = []
			
 
				     for isolate_item in isolate_list:
			
@@ -72,11 +72,11 @@ def deal_duplicate_bb(thead_part):
 
				     """
			
 
				     # 1. find out <td></td> in <thead></thead>.
			
 
				     td_pattern = (
			
 
				-        '<td rowspan="(\d)+" colspan="(\d)+">(.+?)</td>|'
			
 
				-        '<td colspan="(\d)+" rowspan="(\d)+">(.+?)</td>|'
			
 
				-        '<td rowspan="(\d)+">(.+?)</td>|'
			
 
				-        '<td colspan="(\d)+">(.+?)</td>|'
			
 
				-        "<td>(.*?)</td>"
			
 
				+        r"<td rowspan='(\d)+' colspan='(\d)+'>(.+?)</td>|"
			
 
				+        r"<td colspan='(\d)+' rowspan='(\d)+'>(.+?)</td>|"
			
 
				+        r"<td rowspan='(\d)+'>(.+?)</td>|"
			
 
				+        r"<td colspan='(\d)+'>(.+?)</td>|"
			
 
				+        r"<td>(.*?)</td>"
			
 
				     )
			
 
				     td_iter = re.finditer(td_pattern, thead_part)
			
 
				     td_list = [t.group() for t in td_iter]
			
@@ -115,7 +115,7 @@ def deal_bb(result_token):
 
				     origin_thead_part = copy.deepcopy(thead_part)
			
 
				 
			
 
				     # check "rowspan" or "colspan" occur in <thead></thead> parts or not .
			
 
				-    span_pattern = '<td rowspan="(\d)+" colspan="(\d)+">|<td colspan="(\d)+" rowspan="(\d)+">|<td rowspan="(\d)+">|<td colspan="(\d)+">'
			
 
				+    span_pattern = r"<td rowspan='(\d)+' colspan='(\d)+'>|<td colspan='(\d)+' rowspan='(\d)+'>|<td rowspan='(\d)+'>|<td colspan='(\d)+'>"
			
 
				     span_iter = re.finditer(span_pattern, thead_part)
			
 
				     span_list = [s.group() for s in span_iter]
			
 
				     has_span_in_head = True if len(span_list) > 0 else False