소스 검색

重构目录结构

赵小蒙 1 년 전
부모
커밋
f99149b8dd
100개의 변경된 파일, 132개의 추가 및 208개의 삭제
  1. 1 1
      .gitignore
  2. 1 1
      demo/download.py
  3. 1 1
      demo/draw_bbox.py
  4. 3 3
      demo/pdf2md.py
  5. 1 1
      othoers/check_inline_formula.py
  6. 7 7
      othoers/pdf2json_infer.py
  7. 1 5
      othoers/pdf2text_evaluatePdfLayout.py
  8. 1 1
      othoers/pdf2text_getNumberOfColumn.py
  9. 2 9
      othoers/pdf2text_recogFootnoteLine.py
  10. 3 3
      othoers/pdf2text_recogPara_v2.py
  11. 1 5
      othoers/pdf2text_recogTitle.py
  12. 2 2
      othoers/vali_bbox_sort.py
  13. 0 0
      pdf_tools/__init__.py
  14. 0 0
      pdf_tools/dict2md/__init__.py
  15. 1 2
      pdf_tools/dict2md/mkcontent.py
  16. 0 0
      pdf_tools/filter/__init__.py
  17. 2 2
      pdf_tools/filter/pdf_classify_by_type.py
  18. 4 6
      pdf_tools/filter/pdf_meta_scan.py
  19. 0 0
      pdf_tools/layout/__init__.py
  20. 3 3
      pdf_tools/layout/bbox_sort.py
  21. 2 2
      pdf_tools/layout/layout_det_utils.py
  22. 3 4
      pdf_tools/layout/layout_sort.py
  23. 2 2
      pdf_tools/layout/layout_spiler_recog.py
  24. 1 2
      pdf_tools/layout/mcol_sort.py
  25. 0 0
      pdf_tools/libs/__init__.py
  26. 0 0
      pdf_tools/libs/boxbase.py
  27. 0 0
      pdf_tools/libs/calc_span_stats.py
  28. 0 0
      pdf_tools/libs/commons.py
  29. 0 0
      pdf_tools/libs/drop_reason.py
  30. 0 0
      pdf_tools/libs/drop_tag.py
  31. 0 0
      pdf_tools/libs/json_compressor.py
  32. 0 0
      pdf_tools/libs/language.py
  33. 0 0
      pdf_tools/libs/markdown_utils.py
  34. 1 1
      pdf_tools/libs/nlp_utils.py
  35. 2 2
      pdf_tools/libs/pdf_image_tools.py
  36. 0 0
      pdf_tools/libs/safe_filename.py
  37. 0 0
      pdf_tools/libs/textbase.py
  38. 1 3
      pdf_tools/libs/vis_utils.py
  39. 0 0
      pdf_tools/para/__init__.py
  40. 1 2
      pdf_tools/para/block_continuation_processor.py
  41. 1 7
      pdf_tools/para/block_termination_processor.py
  42. 1 1
      pdf_tools/para/commons.py
  43. 1 2
      pdf_tools/para/denoise.py
  44. 2 4
      pdf_tools/para/draw.py
  45. 0 0
      pdf_tools/para/exceptions.py
  46. 1 2
      pdf_tools/para/layout_match_processor.py
  47. 11 12
      pdf_tools/para/para_pipeline.py
  48. 1 3
      pdf_tools/para/raw_processor.py
  49. 2 3
      pdf_tools/para/stats.py
  50. 2 3
      pdf_tools/para/title_processor.py
  51. 0 0
      pdf_tools/pipeline/__init__.py
  52. 33 33
      pdf_tools/pipeline/pdf_parse_by_model.py
  53. 0 0
      pdf_tools/post_proc/__init__.py
  54. 3 3
      pdf_tools/post_proc/detect_para.py
  55. 2 2
      pdf_tools/post_proc/pdf_post_filter.py
  56. 1 1
      pdf_tools/post_proc/remove_footnote.py
  57. 0 0
      pdf_tools/pre_proc/__init__.py
  58. 1 2
      pdf_tools/pre_proc/citationmarker_remove.py
  59. 0 0
      pdf_tools/pre_proc/construct_paras.py
  60. 2 8
      pdf_tools/pre_proc/detect_equation.py
  61. 1 5
      pdf_tools/pre_proc/detect_footer_by_model.py
  62. 1 3
      pdf_tools/pre_proc/detect_footer_header_by_statistics.py
  63. 1 4
      pdf_tools/pre_proc/detect_footnote.py
  64. 1 5
      pdf_tools/pre_proc/detect_header.py
  65. 1 4
      pdf_tools/pre_proc/detect_images.py
  66. 1 5
      pdf_tools/pre_proc/detect_page_number.py
  67. 1 5
      pdf_tools/pre_proc/detect_tables.py
  68. 1 1
      pdf_tools/pre_proc/equations_replace.py
  69. 2 3
      pdf_tools/pre_proc/fix_image.py
  70. 2 6
      pdf_tools/pre_proc/fix_table.py
  71. 0 0
      pdf_tools/pre_proc/main_text_font.py
  72. 3 3
      pdf_tools/pre_proc/pdf_pre_filter.py
  73. 0 0
      pdf_tools/pre_proc/post_layout_split.py
  74. 2 2
      pdf_tools/pre_proc/remove_colored_strip_bbox.py
  75. 1 1
      pdf_tools/pre_proc/remove_footer_header.py
  76. 2 4
      pdf_tools/pre_proc/remove_rotate_bbox.py
  77. 1 1
      pdf_tools/pre_proc/resolve_bbox_conflict.py
  78. 0 0
      pdf_tools/pre_proc/statistics.py
  79. 0 0
      tests/assets/more_para_test_samples/gift_files.txt
  80. 0 0
      tests/assets/more_para_test_samples/scihub_files.txt
  81. 0 0
      tests/assets/more_para_test_samples/zlib_files.txt
  82. 0 0
      tests/assets/paper/images_tables_equations.json
  83. 0 0
      tests/assets/paper/paper.pdf
  84. 0 0
      tests/assets/paper/paper_recogPara.json
  85. 0 0
      tests/assets/paper/paper_recogPara.pdf
  86. 0 0
      tests/assets/paper/pdf_dic.json
  87. 0 0
      tests/assets/paras_test.json
  88. 0 0
      tests/assets/pdf_text_example/vertical_blocks.json
  89. 0 0
      tests/assets/pdf_text_example/vertical_en_blocks.json
  90. 0 0
      tests/assets/pre_proc_results/2列_ViLT_1_title.pdf/preproc_out.json
  91. 0 0
      tests/assets/pre_proc_results/arxiv_2011.13925/preproc_out.json
  92. 0 0
      tests/assets/pre_proc_results/p3_图文混排_5.pdf/preproc_out.json
  93. 0 0
      tests/assets/pre_proc_results/p3_图文混排_6.pdf/preproc_out.json
  94. 0 0
      tests/assets/pre_proc_results/p3_图文混排_84.pdf/preproc_out.json
  95. 0 0
      tests/assets/pre_proc_results/scihub_10800000/preproc_out.json
  96. 0 0
      tests/assets/pre_proc_results/scihub_46600000/preproc_out.json
  97. 0 0
      tests/assets/pre_proc_results/scihub_60900000/preproc_out.json
  98. 0 0
      tests/assets/pre_proc_results/scihub_76800000/preproc_out.json
  99. 0 0
      tests/assets/pre_proc_results/the_eye_cdn_00412782/preproc_out.json
  100. 0 0
      tests/assets/pre_proc_results/中文单列_书籍_100247_4_装饰图片.pdf/preproc_out.json

+ 1 - 1
.gitignore

@@ -31,6 +31,6 @@ tmp/
 tmp
 .vscode
 .vscode/
-/test/
+/tests/
 
 /app/pdf_toolbox/test/test_bookname.txt

+ 1 - 1
demo/download.py

@@ -2,7 +2,7 @@ import json
 import os
 from tqdm import tqdm
 
-from libs.commons import join_path
+from pdf_tools.libs import join_path
 
 with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
     samples = json.load(f)

+ 1 - 1
demo/draw_bbox.py

@@ -1,4 +1,4 @@
-from libs.commons import fitz  # PyMuPDF
+from pdf_tools.libs import fitz  # PyMuPDF
 
 # PDF文件路径
 pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"

+ 3 - 3
demo/pdf2md.py

@@ -5,9 +5,9 @@ from pathlib import Path
 import click
 from loguru import logger
 
-from libs.commons import join_path
-from dict2md.mkcontent import mk_mm_markdown
-from pipeline.pdf_parse_by_model import parse_pdf_by_model
+from pdf_tools.libs import join_path
+from pdf_tools.dict2md.mkcontent import mk_mm_markdown
+from pdf_tools.pipeline import parse_pdf_by_model
 
 
 

+ 1 - 1
othoers/check_inline_formula.py

@@ -1,5 +1,5 @@
 # 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
-from libs.commons import fitz
+from pdf_tools.libs import fitz
 
 
 def check_inline_formula(page, inline_formula_boxes):

+ 7 - 7
othoers/pdf2json_infer.py

@@ -3,7 +3,7 @@ from typing import Tuple
 import os
 import boto3, json
 from botocore.config import Config
-from libs.commons import fitz
+from pdf_tools.libs import fitz
 from loguru import logger
 from pathlib import Path
 from tqdm import tqdm
@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
 # from pdf2text_recogPara import parse_blocks_per_page    
 # from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
 
-from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
-from pre_proc.detect_images import parse_images          # 获取figures的bbox
-from pre_proc.detect_tables import parse_tables           # 获取tables的bbox
-from pre_proc.detect_equation import parse_equations     # 获取equations的bbox
+from pdf_tools.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
+from pdf_tools.pre_proc import parse_images          # 获取figures的bbox
+from pdf_tools.pre_proc.detect_tables import parse_tables           # 获取tables的bbox
+from pdf_tools.pre_proc import parse_equations     # 获取equations的bbox
 # from pdf2text_recogFootnote import parse_footnotes     # 获取footnotes的bbox
-from post_proc.detect_para import process_blocks_per_page
-from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
+from pdf_tools.post_proc.detect_para import process_blocks_per_page
+from pdf_tools.libs import parse_aws_param, parse_bucket_key, read_file, join_path
 
 
 def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):

+ 1 - 5
othoers/pdf2text_evaluatePdfLayout.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs import fitz             # pyMuPDF库
 
 
 def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):

+ 1 - 1
othoers/pdf2text_getNumberOfColumn.py

@@ -1,4 +1,4 @@
-from libs.commons import fitz
+from pdf_tools.libs import fitz
 from typing import List
 
 

+ 2 - 9
othoers/pdf2text_recogFootnoteLine.py

@@ -1,14 +1,7 @@
-import io
 import re
-import os
-import json
-from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
-from libs.commons import fitz
-from fitz import Point
-from pprint import pprint
-import pickle
+from pdf_tools.libs import _is_in_or_part_overlap
+from pdf_tools.libs import fitz
 import collections
-from typing import List
 
 
 def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):

+ 3 - 3
othoers/pdf2text_recogPara_v2.py

@@ -11,8 +11,8 @@ import numpy as np
 from termcolor import cprint
 
 
-from libs.commons import fitz
-from libs.nlp_utils import NLPModels
+from pdf_tools.libs import fitz
+from pdf_tools.libs import NLPModels
 
 
 if sys.version_info[0] >= 3:
@@ -3478,7 +3478,7 @@ Params:
 
 if __name__ == "__main__":
     DEFAULT_PDF_PATH = (
-        "app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf"
+        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
     )
     input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
     output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"

+ 1 - 5
othoers/pdf2text_recogTitle.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):

+ 2 - 2
othoers/vali_bbox_sort.py

@@ -1,8 +1,8 @@
 import numpy as np
 import tqdm
 import json
-from validation import cal_edit_distance, format_gt_bbox, label_match, detect_val
-from layout.layout_sort import sort_with_layout
+from validation import cal_edit_distance, format_gt_bbox
+from pdf_tools.layout.layout_sort import sort_with_layout
 
 with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
     samples = json.load(f)

+ 0 - 0
dict2md/__init__.py → pdf_tools/__init__.py


+ 0 - 0
filter/__init__.py → pdf_tools/dict2md/__init__.py


+ 1 - 2
dict2md/mkcontent.py → pdf_tools/dict2md/mkcontent.py

@@ -1,8 +1,7 @@
-import re
 import math
 from loguru import logger
 
-from libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
+from pdf_tools.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
 
 
 def mk_nlp_markdown(para_dict: dict):

+ 0 - 0
layout/__init__.py → pdf_tools/filter/__init__.py


+ 2 - 2
filter/pdf_classify_by_type.py → pdf_tools/filter/pdf_classify_by_type.py

@@ -16,8 +16,8 @@ from collections import Counter
 import click
 import numpy as np
 
-from libs.commons import mymax, get_top_percent_list
-from filter.pdf_meta_scan import scan_max_page, junk_limit_min
+from pdf_tools.libs.commons import mymax, get_top_percent_list
+from pdf_tools.filter.pdf_meta_scan import scan_max_page, junk_limit_min
 
 TEXT_LEN_THRESHOLD = 100
 AVG_TEXT_LEN_THRESHOLD = 200

+ 4 - 6
filter/pdf_meta_scan.py → pdf_tools/filter/pdf_meta_scan.py

@@ -2,18 +2,16 @@
 输入: s3路径,每行一个
 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
 """
-import math
 import sys
 import click
 
-from libs.commons import read_file, mymax, get_top_percent_list
-import json
-from libs.commons import fitz
+from pdf_tools.libs.commons import read_file, mymax, get_top_percent_list
+from pdf_tools.libs.commons import fitz
 from loguru import logger
 from collections import Counter
 
-from libs.drop_reason import DropReason
-from libs.language import detect_lang
+from pdf_tools.libs.drop_reason import DropReason
+from pdf_tools.libs.language import detect_lang
 
 scan_max_page = 50
 junk_limit_min = 10

+ 0 - 0
libs/__init__.py → pdf_tools/layout/__init__.py


+ 3 - 3
layout/bbox_sort.py → pdf_tools/layout/bbox_sort.py

@@ -3,9 +3,9 @@
 
 
 
-from layout.layout_spiler_recog import get_spilter_of_page
-from libs.boxbase import _is_bottom_full_overlap, _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
-from libs.commons import mymax
+from pdf_tools.layout.layout_spiler_recog import get_spilter_of_page
+from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
+from pdf_tools.libs.commons import mymax
 
 X0_IDX = 0
 Y0_IDX = 1

+ 2 - 2
layout/layout_det_utils.py → pdf_tools/layout/layout_det_utils.py

@@ -1,5 +1,5 @@
-from layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
-from libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
+from pdf_tools.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
+from pdf_tools.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
 
 
 def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:

+ 3 - 4
layout/layout_sort.py → pdf_tools/layout/layout_sort.py

@@ -2,11 +2,10 @@
 对pdf上的box进行layout识别,并对内部组成的box进行排序
 """
 
-import json
 from loguru import logger
-from layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort
-from layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes
-from libs.boxbase import get_bbox_in_boundry
+from pdf_tools.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort
+from pdf_tools.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes
+from pdf_tools.libs.boxbase import get_bbox_in_boundry
 
 
 LAYOUT_V = "V"

+ 2 - 2
layout/layout_spiler_recog.py → pdf_tools/layout/layout_spiler_recog.py

@@ -3,8 +3,8 @@
 """
 
 import os
-from libs.commons import fitz
-from libs.boxbase import _is_in_or_part_overlap
+from pdf_tools.libs.commons import fitz
+from pdf_tools.libs.boxbase import _is_in_or_part_overlap
 
 
 def __rect_filter_by_width(rect, page_w, page_h):

+ 1 - 2
layout/mcol_sort.py → pdf_tools/layout/mcol_sort.py

@@ -49,9 +49,8 @@ Usage
       print(page.get_text(clip=rect, sort=True))
   ----------------------------------------------------------------------------------
 """
-import os
 import sys
-from libs.commons import fitz
+from pdf_tools.libs.commons import fitz
 
 
 def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):

+ 0 - 0
para/__init__.py → pdf_tools/libs/__init__.py


+ 0 - 0
libs/boxbase.py → pdf_tools/libs/boxbase.py


+ 0 - 0
libs/calc_span_stats.py → pdf_tools/libs/calc_span_stats.py


+ 0 - 0
libs/commons.py → pdf_tools/libs/commons.py


+ 0 - 0
libs/drop_reason.py → pdf_tools/libs/drop_reason.py


+ 0 - 0
libs/drop_tag.py → pdf_tools/libs/drop_tag.py


+ 0 - 0
libs/json_compressor.py → pdf_tools/libs/json_compressor.py


+ 0 - 0
libs/language.py → pdf_tools/libs/language.py


+ 0 - 0
libs/markdown_utils.py → pdf_tools/libs/markdown_utils.py


+ 1 - 1
libs/nlp_utils.py → pdf_tools/libs/nlp_utils.py

@@ -10,7 +10,7 @@ import spacy
 import en_core_web_sm
 import zh_core_web_sm
 
-from libs.language import detect_lang
+from pdf_tools.libs.language import detect_lang
 
 
 class NLPModels:

+ 2 - 2
libs/pdf_image_tools.py → pdf_tools/libs/pdf_image_tools.py

@@ -4,9 +4,9 @@ from typing import Tuple
 import io
 
 # from app.common.s3 import get_s3_client
-from libs.commons import fitz
+from pdf_tools.libs.commons import fitz
 from loguru import logger
-from libs.commons import parse_bucket_key, join_path
+from pdf_tools.libs.commons import parse_bucket_key, join_path
 
 
 def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):

+ 0 - 0
libs/safe_filename.py → pdf_tools/libs/safe_filename.py


+ 0 - 0
libs/textbase.py → pdf_tools/libs/textbase.py


+ 1 - 3
libs/vis_utils.py → pdf_tools/libs/vis_utils.py

@@ -1,7 +1,5 @@
-from libs.commons import fitz
+from pdf_tools.libs.commons import fitz
 import os
-from loguru import logger
-from layout.bbox_sort import CONTENT_TYPE_IDX
 
 
 def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):

+ 0 - 0
pipeline/__init__.py → pdf_tools/para/__init__.py


+ 1 - 2
para/block_continuation_processor.py → pdf_tools/para/block_continuation_processor.py

@@ -1,8 +1,7 @@
 import os
-import sys
 import unicodedata
 
-from para.commons import *
+from pdf_tools.para.commons import *
 
 
 if sys.version_info[0] >= 3:

+ 1 - 7
para/block_termination_processor.py → pdf_tools/para/block_termination_processor.py

@@ -1,10 +1,4 @@
-import sys
-
-from libs.commons import fitz
-
-from termcolor import cprint
-
-from para.commons import *
+from pdf_tools.para.commons import *
 
 
 if sys.version_info[0] >= 3:

+ 1 - 1
para/commons.py → pdf_tools/para/commons.py

@@ -1,6 +1,6 @@
 import sys
 
-from libs.commons import fitz
+from pdf_tools.libs.commons import fitz
 from termcolor import cprint
 
 

+ 1 - 2
para/denoise.py → pdf_tools/para/denoise.py

@@ -1,8 +1,7 @@
-import sys
 import math
 
 from collections import defaultdict
-from para.commons import *
+from pdf_tools.para.commons import *
 
 if sys.version_info[0] >= 3:
     sys.stdout.reconfigure(encoding="utf-8")  # type: ignore

+ 2 - 4
para/draw.py → pdf_tools/para/draw.py

@@ -1,8 +1,6 @@
-import sys
+from pdf_tools.libs.commons import fitz
 
-from libs.commons import fitz
-
-from para.commons import *
+from pdf_tools.para.commons import *
 
 
 if sys.version_info[0] >= 3:

+ 0 - 0
para/exceptions.py → pdf_tools/para/exceptions.py


+ 1 - 2
para/layout_match_processor.py → pdf_tools/para/layout_match_processor.py

@@ -1,6 +1,5 @@
-import sys
 import math
-from para.commons import *
+from pdf_tools.para.commons import *
 
 
 if sys.version_info[0] >= 3:

+ 11 - 12
para/para_pipeline.py → pdf_tools/para/para_pipeline.py

@@ -1,18 +1,17 @@
 import os
-import sys
 import json
 
-from para.commons import *
-
-from para.raw_processor import RawBlockProcessor
-from para.layout_match_processor import LayoutFilterProcessor
-from para.stats import BlockStatisticsCalculator
-from para.stats import DocStatisticsCalculator
-from para.title_processor import TitleProcessor
-from para.block_termination_processor import BlockTerminationProcessor
-from para.block_continuation_processor import BlockContinuationProcessor
-from para.draw import DrawAnnos
-from para.exceptions import (
+from pdf_tools.para.commons import *
+
+from pdf_tools.para.raw_processor import RawBlockProcessor
+from pdf_tools.para.layout_match_processor import LayoutFilterProcessor
+from pdf_tools.para.stats import BlockStatisticsCalculator
+from pdf_tools.para.stats import DocStatisticsCalculator
+from pdf_tools.para.title_processor import TitleProcessor
+from pdf_tools.para.block_termination_processor import BlockTerminationProcessor
+from pdf_tools.para.block_continuation_processor import BlockContinuationProcessor
+from pdf_tools.para.draw import DrawAnnos
+from pdf_tools.para.exceptions import (
     DenseSingleLineBlockException,
     TitleDetectionException,
     TitleLevelException,

+ 1 - 3
para/raw_processor.py → pdf_tools/para/raw_processor.py

@@ -1,5 +1,3 @@
-from para.commons import *
-
 class RawBlockProcessor:
     def __init__(self) -> None:
         self.y_tolerance = 2
@@ -186,7 +184,7 @@ class RawBlockProcessor:
             The instance of the class.
         ----------
         blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/test/preproc_2_parasplit_example.json.
+            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json.
 
         Returns
         -------

+ 2 - 3
para/stats.py → pdf_tools/para/stats.py

@@ -1,8 +1,7 @@
-import sys
 from collections import Counter
 import numpy as np
 
-from para.commons import *
+from pdf_tools.para.commons import *
 
 
 if sys.version_info[0] >= 3:
@@ -149,7 +148,7 @@ class BlockStatisticsCalculator:
             The instance of the class.
         ----------
         blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/test/preproc_2_parasplit_example.json
+            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
 
         Returns
         -------

+ 2 - 3
para/title_processor.py → pdf_tools/para/title_processor.py

@@ -1,11 +1,10 @@
 import os
-import sys
 import re
 import numpy as np
 
-from libs.nlp_utils import NLPModels
+from pdf_tools.libs.nlp_utils import NLPModels
 
-from para.commons import *
+from pdf_tools.para.commons import *
 
 if sys.version_info[0] >= 3:
     sys.stdout.reconfigure(encoding="utf-8")  # type: ignore

+ 0 - 0
post_proc/__init__.py → pdf_tools/pipeline/__init__.py


+ 33 - 33
pipeline/pdf_parse_by_model.py → pdf_tools/pipeline/pdf_parse_by_model.py

@@ -2,28 +2,28 @@ import time
 
 # from anyio import Path
 
-from libs.commons import fitz, get_delta_time, get_img_s3_client
+from pdf_tools.libs.commons import fitz, get_delta_time, get_img_s3_client
 import json
 import os
 import math
 from loguru import logger
-from layout.bbox_sort import (
+from pdf_tools.layout.bbox_sort import (
     prepare_bboxes_for_layout_split,
 )
-from layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
-from libs.drop_reason import DropReason
-from libs.markdown_utils import escape_special_markdown_char
-from libs.safe_filename import sanitize_filename
-from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
-from pre_proc.detect_images import parse_images
-from pre_proc.detect_tables import parse_tables  # 获取tables的bbox
-from pre_proc.detect_equation import parse_equations  # 获取equations的bbox
-from pre_proc.detect_header import parse_headers  # 获取headers的bbox
-from pre_proc.detect_page_number import parse_pageNos  # 获取pageNos的bbox
-from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
-from pre_proc.detect_footer_by_model import parse_footers  # 获取footers的bbox
-
-from post_proc.detect_para import (
+from pdf_tools.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
+from pdf_tools.libs.drop_reason import DropReason
+from pdf_tools.libs.markdown_utils import escape_special_markdown_char
+from pdf_tools.libs.safe_filename import sanitize_filename
+from pdf_tools.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
+from pdf_tools.pre_proc.detect_images import parse_images
+from pdf_tools.pre_proc.detect_tables import parse_tables  # 获取tables的bbox
+from pdf_tools.pre_proc.detect_equation import parse_equations  # 获取equations的bbox
+from pdf_tools.pre_proc.detect_header import parse_headers  # 获取headers的bbox
+from pdf_tools.pre_proc.detect_page_number import parse_pageNos  # 获取pageNos的bbox
+from pdf_tools.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
+from pdf_tools.pre_proc.detect_footer_by_model import parse_footers  # 获取footers的bbox
+
+from pdf_tools.post_proc.detect_para import (
     ParaProcessPipeline,
     TitleDetectionException,
     TitleLevelException,
@@ -31,9 +31,9 @@ from post_proc.detect_para import (
     ParaMergeException,
     DenseSingleLineBlockException,
 )
-from pre_proc.main_text_font import get_main_text_font
-from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
-from pre_proc.remove_footer_header import remove_headder_footer_one_page
+from pdf_tools.pre_proc.main_text_font import get_main_text_font
+from pdf_tools.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
+from pdf_tools.pre_proc.remove_footer_header import remove_headder_footer_one_page
 
 '''
 from para.para_pipeline import ParaProcessPipeline
@@ -46,19 +46,19 @@ from para.exceptions import (
 )
 '''
 
-from libs.commons import read_file, join_path
-from libs.pdf_image_tools import save_images_by_bboxes
-from post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
-from pre_proc.citationmarker_remove import remove_citation_marker
-from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
-from pre_proc.pdf_pre_filter import pdf_filter
-from pre_proc.detect_footer_header_by_statistics import drop_footer_header
-from pre_proc.construct_paras import construct_page_component
-from pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
-from post_proc.pdf_post_filter import pdf_post_filter
-from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
-from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
-from pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
+from pdf_tools.libs.commons import read_file, join_path
+from pdf_tools.libs.pdf_image_tools import save_images_by_bboxes
+from pdf_tools.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
+from pdf_tools.pre_proc.citationmarker_remove import remove_citation_marker
+from pdf_tools.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
+from pdf_tools.pre_proc.pdf_pre_filter import pdf_filter
+from pdf_tools.pre_proc.detect_footer_header_by_statistics import drop_footer_header
+from pdf_tools.pre_proc.construct_paras import construct_page_component
+from pdf_tools.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
+from pdf_tools.post_proc.pdf_post_filter import pdf_post_filter
+from pdf_tools.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
+from pdf_tools.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
+from pdf_tools.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
 
 denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
 titleDetectionException_msg = TitleDetectionException().message
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
     debug_mode=False,
 ):
     pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
-    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
+    save_tmp_path = os.path.join(os.path.dirname(__file__), "../../..", "tmp", "unittest")
     md_bookname_save_path = ""
     book_name = sanitize_filename(book_name)
     if debug_mode:

+ 0 - 0
pre_proc/__init__.py → pdf_tools/post_proc/__init__.py


+ 3 - 3
post_proc/detect_para.py → pdf_tools/post_proc/detect_para.py

@@ -11,8 +11,8 @@ import numpy as np
 from termcolor import cprint
 
 
-from libs.commons import fitz
-from libs.nlp_utils import NLPModels
+from pdf_tools.libs.commons import fitz
+from pdf_tools.libs.nlp_utils import NLPModels
 
 
 if sys.version_info[0] >= 3:
@@ -3404,7 +3404,7 @@ Params:
 
 if __name__ == "__main__":
     DEFAULT_PDF_PATH = (
-        "app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf"
+        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
     )
     input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
     output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"

+ 2 - 2
post_proc/pdf_post_filter.py → pdf_tools/post_proc/pdf_post_filter.py

@@ -1,7 +1,7 @@
 from loguru import logger
 
-from layout.layout_sort import get_columns_cnt_of_layout
-from libs.drop_reason import DropReason
+from pdf_tools.layout.layout_sort import get_columns_cnt_of_layout
+from pdf_tools.libs.drop_reason import DropReason
 
 
 def __is_pseudo_single_column(page_info) -> bool:

+ 1 - 1
post_proc/remove_footnote.py → pdf_tools/post_proc/remove_footnote.py

@@ -1,4 +1,4 @@
-from libs.boxbase import _is_in, _is_in_or_part_overlap
+from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
 import collections      # 统计库
 
 

+ 0 - 0
pre_proc/post_layout_split.py → pdf_tools/pre_proc/__init__.py


+ 1 - 2
pre_proc/citationmarker_remove.py → pdf_tools/pre_proc/citationmarker_remove.py

@@ -3,8 +3,7 @@
 https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
 """
 import re
-from loguru import logger
-from libs.nlp_utils import NLPModels
+from pdf_tools.libs.nlp_utils import NLPModels
 
 
 __NLP_MODEL = NLPModels()

+ 0 - 0
pre_proc/construct_paras.py → pdf_tools/pre_proc/construct_paras.py


+ 2 - 8
pre_proc/detect_equation.py → pdf_tools/pre_proc/detect_equation.py

@@ -1,11 +1,5 @@
-import os                   
-import collections      # 统计库
-import re
-from libs.boxbase import _is_in               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
-from pathlib import Path
-
+from pdf_tools.libs.boxbase import _is_in               # 正则
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def __solve_contain_bboxs(all_bbox_list: list):

+ 1 - 5
pre_proc/detect_footer_by_model.py → pdf_tools/pre_proc/detect_footer_by_model.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):

+ 1 - 3
pre_proc/detect_footer_header_by_statistics.py → pdf_tools/pre_proc/detect_footer_header_by_statistics.py

@@ -1,8 +1,6 @@
 from collections import defaultdict
 
-from loguru import logger
-
-from libs.boxbase import _is_in, calculate_iou
+from pdf_tools.libs.boxbase import calculate_iou
 
 
 def compare_bbox_with_list(bbox, bbox_list, tolerance=1):

+ 1 - 4
pre_proc/detect_footnote.py → pdf_tools/pre_proc/detect_footnote.py

@@ -1,8 +1,5 @@
-import os
 from collections import Counter
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):

+ 1 - 5
pre_proc/detect_header.py → pdf_tools/pre_proc/detect_header.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):

+ 1 - 4
pre_proc/detect_images.py → pdf_tools/pre_proc/detect_images.py

@@ -1,9 +1,6 @@
-import os                   
 import collections      # 统计库
 import re
-from libs.boxbase import _is_in_or_part_overlap               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 #--------------------------------------- Tool Functions --------------------------------------#

+ 1 - 5
pre_proc/detect_page_number.py → pdf_tools/pre_proc/detect_page_number.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):

+ 1 - 5
pre_proc/detect_tables.py → pdf_tools/pre_proc/detect_tables.py

@@ -1,8 +1,4 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json             # json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 
 
 def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):

+ 1 - 1
pre_proc/equations_replace.py → pdf_tools/pre_proc/equations_replace.py

@@ -1,7 +1,7 @@
 """
 对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
 """
-from libs.commons import fitz
+from pdf_tools.libs.commons import fitz
 import json
 import os
 from pathlib import Path

+ 2 - 3
pre_proc/fix_image.py → pdf_tools/pre_proc/fix_image.py

@@ -2,10 +2,9 @@
 
 
 import re    
-from libs.boxbase import  _is_in_or_part_overlap, _is_part_overlap, _is_in, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
-from loguru import logger
+from pdf_tools.libs.boxbase import  _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
 
-from libs.textbase import get_text_block_base_info
+from pdf_tools.libs.textbase import get_text_block_base_info
 
 def fix_image_vertical(image_bboxes:list, text_blocks:list):
     """

+ 2 - 6
pre_proc/fix_table.py → pdf_tools/pre_proc/fix_table.py

@@ -1,11 +1,7 @@
-import os                   
-import collections      # 统计库
-import re               # 正则
-from libs.commons import fitz             # pyMuPDF库
-import json
+from pdf_tools.libs.commons import fitz             # pyMuPDF库
 import re
 
-from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json
+from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json
 
 
 ## version 2

+ 0 - 0
pre_proc/main_text_font.py → pdf_tools/pre_proc/main_text_font.py


+ 3 - 3
pre_proc/pdf_pre_filter.py → pdf_tools/pre_proc/pdf_pre_filter.py

@@ -1,6 +1,6 @@
-from libs.commons import fitz
-from libs.boxbase import _is_in, _is_in_or_part_overlap
-from libs.drop_reason import DropReason
+from pdf_tools.libs.commons import fitz
+from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
+from pdf_tools.libs.drop_reason import DropReason
 
 
 def __area(box):

+ 0 - 0
test/assets/more_para_test_samples/gift_files.txt → pdf_tools/pre_proc/post_layout_split.py


+ 2 - 2
pre_proc/remove_colored_strip_bbox.py → pdf_tools/pre_proc/remove_colored_strip_bbox.py

@@ -1,7 +1,7 @@
-from libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
+from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
 from loguru import logger
 
-from libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
+from pdf_tools.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
 
 
 def __area(box):

+ 1 - 1
pre_proc/remove_footer_header.py → pdf_tools/pre_proc/remove_footer_header.py

@@ -1,6 +1,6 @@
 import re
 
-from libs.boxbase import _is_in_or_part_overlap
+from pdf_tools.libs.boxbase import _is_in_or_part_overlap
 
 
 def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,

+ 2 - 4
pre_proc/remove_rotate_bbox.py → pdf_tools/pre_proc/remove_rotate_bbox.py

@@ -1,8 +1,6 @@
-
-import json
 import math
 
-from libs.boxbase import is_vbox_on_side
+from pdf_tools.libs.boxbase import is_vbox_on_side
 
 
 def detect_non_horizontal_texts(result_dict):
@@ -84,7 +82,7 @@ def detect_non_horizontal_texts(result_dict):
 1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
 2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
 """
-import string, re
+import re
 
 def __is_a_word(sentence):
     # 如果输入是中文并且长度为1,则返回True

+ 1 - 1
pre_proc/resolve_bbox_conflict.py → pdf_tools/pre_proc/resolve_bbox_conflict.py

@@ -5,7 +5,7 @@
 2. 然后去掉出现在文字blcok上的图片bbox
 """
 
-from libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap, calculate_iou, calculate_overlap_area_2_minbox_area_ratio
+from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
 
 
 def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):

+ 0 - 0
pre_proc/statistics.py → pdf_tools/pre_proc/statistics.py


+ 0 - 0
test/assets/more_para_test_samples/zlib_files.txt → tests/assets/more_para_test_samples/gift_files.txt


+ 0 - 0
test/assets/more_para_test_samples/scihub_files.txt → tests/assets/more_para_test_samples/scihub_files.txt


+ 0 - 0
tests/assets/more_para_test_samples/zlib_files.txt


+ 0 - 0
test/assets/paper/images_tables_equations.json → tests/assets/paper/images_tables_equations.json


+ 0 - 0
test/assets/paper/paper.pdf → tests/assets/paper/paper.pdf


+ 0 - 0
test/assets/paper/paper_recogPara.json → tests/assets/paper/paper_recogPara.json


+ 0 - 0
test/assets/paper/paper_recogPara.pdf → tests/assets/paper/paper_recogPara.pdf


+ 0 - 0
test/assets/paper/pdf_dic.json → tests/assets/paper/pdf_dic.json


+ 0 - 0
test/assets/paras_test.json → tests/assets/paras_test.json


+ 0 - 0
test/assets/pdf_text_example/vertical_blocks.json → tests/assets/pdf_text_example/vertical_blocks.json


+ 0 - 0
test/assets/pdf_text_example/vertical_en_blocks.json → tests/assets/pdf_text_example/vertical_en_blocks.json


+ 0 - 0
test/assets/pre_proc_results/2列_ViLT_1_title.pdf/preproc_out.json → tests/assets/pre_proc_results/2列_ViLT_1_title.pdf/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/arxiv_2011.13925/preproc_out.json → tests/assets/pre_proc_results/arxiv_2011.13925/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/p3_图文混排_5.pdf/preproc_out.json → tests/assets/pre_proc_results/p3_图文混排_5.pdf/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/p3_图文混排_6.pdf/preproc_out.json → tests/assets/pre_proc_results/p3_图文混排_6.pdf/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/p3_图文混排_84.pdf/preproc_out.json → tests/assets/pre_proc_results/p3_图文混排_84.pdf/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/scihub_10800000/preproc_out.json → tests/assets/pre_proc_results/scihub_10800000/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/scihub_46600000/preproc_out.json → tests/assets/pre_proc_results/scihub_46600000/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/scihub_60900000/preproc_out.json → tests/assets/pre_proc_results/scihub_60900000/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/scihub_76800000/preproc_out.json → tests/assets/pre_proc_results/scihub_76800000/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/the_eye_cdn_00412782/preproc_out.json → tests/assets/pre_proc_results/the_eye_cdn_00412782/preproc_out.json


+ 0 - 0
test/assets/pre_proc_results/中文单列_书籍_100247_4_装饰图片.pdf/preproc_out.json → tests/assets/pre_proc_results/中文单列_书籍_100247_4_装饰图片.pdf/preproc_out.json


이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.