|
@@ -72,7 +72,7 @@ def test_pipeline_with_two_config():
|
|
|
res_json_path = (
|
|
res_json_path = (
|
|
|
Path(__file__).parent / "output" / "test" / "txt" / "test_content_list.json"
|
|
Path(__file__).parent / "output" / "test" / "txt" / "test_content_list.json"
|
|
|
).as_posix()
|
|
).as_posix()
|
|
|
- assert_content(res_json_path)
|
|
|
|
|
|
|
+ assert_content(res_json_path, parse_method="txt")
|
|
|
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
|
|
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
|
|
|
pipeline_doc_analyze(
|
|
pipeline_doc_analyze(
|
|
|
pdf_bytes_list,
|
|
pdf_bytes_list,
|
|
@@ -93,7 +93,7 @@ def test_pipeline_with_two_config():
|
|
|
res_json_path = (
|
|
res_json_path = (
|
|
|
Path(__file__).parent / "output" / "test" / "ocr" / "test_content_list.json"
|
|
Path(__file__).parent / "output" / "test" / "ocr" / "test_content_list.json"
|
|
|
).as_posix()
|
|
).as_posix()
|
|
|
- assert_content(res_json_path)
|
|
|
|
|
|
|
+ assert_content(res_json_path, parse_method="ocr")
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_vlm_transformers_with_default_config():
|
|
def test_vlm_transformers_with_default_config():
|
|
@@ -164,7 +164,7 @@ def test_vlm_transformers_with_default_config():
|
|
|
res_json_path = (
|
|
res_json_path = (
|
|
|
Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json"
|
|
Path(__file__).parent / "output" / "test" / "vlm" / "test_content_list.json"
|
|
|
).as_posix()
|
|
).as_posix()
|
|
|
- assert_content(res_json_path)
|
|
|
|
|
|
|
+ assert_content(res_json_path, parse_method="vlm")
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_infer_result(
|
|
def write_infer_result(
|
|
@@ -238,7 +238,7 @@ def validate_html(html_content):
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
-def assert_content(content_path):
|
|
|
|
|
|
|
+def assert_content(content_path, parse_method="txt"):
|
|
|
content_list = []
|
|
content_list = []
|
|
|
with open(content_path, "r", encoding="utf-8") as file:
|
|
with open(content_path, "r", encoding="utf-8") as file:
|
|
|
content_list = json.load(file)
|
|
content_list = json.load(file)
|
|
@@ -267,19 +267,15 @@ def assert_content(content_path):
|
|
|
)
|
|
)
|
|
|
assert validate_html(content_dict["table_body"])
|
|
assert validate_html(content_dict["table_body"])
|
|
|
target_str_list = [
|
|
target_str_list = [
|
|
|
- "Linear Regression",
|
|
|
|
|
|
|
+ "Model",
|
|
|
|
|
+ "Testing",
|
|
|
|
|
+ "Error",
|
|
|
|
|
+ "Linear",
|
|
|
|
|
+ "Regression",
|
|
|
"0.98740",
|
|
"0.98740",
|
|
|
"1321.2",
|
|
"1321.2",
|
|
|
- "2-order Polynomial",
|
|
|
|
|
- "0.99906",
|
|
|
|
|
- "26.4",
|
|
|
|
|
- "3-order Polynomial",
|
|
|
|
|
- "0.99913",
|
|
|
|
|
- "101.2",
|
|
|
|
|
- "4-order Polynomial",
|
|
|
|
|
- "0.99914",
|
|
|
|
|
- "94.1",
|
|
|
|
|
- "Gray Prediction",
|
|
|
|
|
|
|
+ "Gray",
|
|
|
|
|
+ "Prediction",
|
|
|
"0.00617",
|
|
"0.00617",
|
|
|
"687",
|
|
"687",
|
|
|
]
|
|
]
|
|
@@ -287,8 +283,12 @@ def assert_content(content_path):
|
|
|
for target_str in target_str_list:
|
|
for target_str in target_str_list:
|
|
|
if target_str in content_dict["table_body"]:
|
|
if target_str in content_dict["table_body"]:
|
|
|
correct_count += 1
|
|
correct_count += 1
|
|
|
-
|
|
|
|
|
- assert correct_count > 0.9 * len(target_str_list)
|
|
|
|
|
|
|
+ if parse_method == "txt" or parse_method == "ocr":
|
|
|
|
|
+ assert correct_count > 0.9 * len(target_str_list)
|
|
|
|
|
+ elif parse_method == "vlm":
|
|
|
|
|
+ assert correct_count > 0.7 * len(target_str_list)
|
|
|
|
|
+ else:
|
|
|
|
|
+ assert False
|
|
|
# 公式校验,检测是否含有公式元素
|
|
# 公式校验,检测是否含有公式元素
|
|
|
case "equation":
|
|
case "equation":
|
|
|
type_set.add("equation")
|
|
type_set.add("equation")
|