|
@@ -4,7 +4,7 @@ import os
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
-from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
|
|
|
|
|
|
|
+from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
|
|
|
from magic_pdf.libs.commons import join_path
|
|
from magic_pdf.libs.commons import join_path
|
|
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
|
|
|
|
|
|
@@ -30,12 +30,12 @@ def read_json_file(file_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
- # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
|
|
|
|
|
- # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
|
|
|
|
|
|
|
+ ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
|
|
|
|
|
+ ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
|
|
|
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
|
|
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
|
|
|
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
|
|
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
|
|
|
- ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
|
|
|
|
|
- ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
|
|
|
|
|
|
|
+ # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
|
|
|
|
|
+ # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
|
|
|
try:
|
|
try:
|
|
|
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
|
|
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
|
|
|
pth = Path(ocr_json_file_path)
|
|
pth = Path(ocr_json_file_path)
|
|
@@ -56,8 +56,8 @@ if __name__ == '__main__':
|
|
|
if not os.path.exists(parent_dir):
|
|
if not os.path.exists(parent_dir):
|
|
|
os.makedirs(parent_dir)
|
|
os.makedirs(parent_dir)
|
|
|
|
|
|
|
|
- # markdown_content = mk_nlp_markdown(pdf_info_dict)
|
|
|
|
|
- markdown_content = mk_mm_markdown(pdf_info_dict)
|
|
|
|
|
|
|
+ # markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
|
|
|
|
|
+ markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
|
|
|
|
|
|
|
|
with open(text_content_save_path, "w", encoding="utf-8") as f:
|
|
with open(text_content_save_path, "w", encoding="utf-8") as f:
|
|
|
f.write(markdown_content)
|
|
f.write(markdown_content)
|