| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
# Tunables grouped under the XY-cut name. The exact meaning of each entry is
# defined by the code that reads this mapping (not visible in this module).
XYCUT_SETTINGS = dict(
    child_block_overlap_ratio_threshold=0.1,
    edge_distance_compare_tolerance_len=2,
    # Three weights spanning eight orders of magnitude (1e4 down to 1e-4),
    # so the "edge" term dominates the "up" term, which dominates "down".
    distance_weight_map=dict(
        edge_weight=10_000,
        up_edge_weight=1,
        down_edge_weight=1e-4,
    ),
)
# Overlap-ratio cut-offs used at region level: one for matching blocks,
# one for splitting them. Both are fractions in the 0..1 range.
REGION_SETTINGS = dict(
    match_block_overlap_ratio_threshold=0.6,
    split_block_overlap_ratio_threshold=0.4,
)
# Block-level settings.
BLOCK_SETTINGS = dict(
    # Area-ratio threshold governing the paragraph_title -> doc_title update.
    title_conversion_area_ratio_threshold=0.3,
)
# Line-level settings.
LINE_SETTINGS = dict(
    # Height-IoU threshold used for line segmentation of OCR results.
    line_height_iou_threshold=0.6,
    # Delimiter string per label: a single space for "doc_title",
    # a newline for "content".
    delimiter_map=dict(
        doc_title=" ",
        content="\n",
    ),
)
# Groups of layout-block label strings, keyed by the role the group plays.
# Each value is a list of label names; list order is preserved as written.
BLOCK_LABEL_MAP = {
    # Document title
    "doc_title_labels": ["doc_title"],
    # Paragraph-level titles
    "paragraph_title_labels": [
        "paragraph_title",
        "abstract_title",
        "reference_title",
        "content_title",
    ],
    # Visual elements: image, table, chart, flowchart, figure
    # (note: "seal" is listed under unordered_labels, not here)
    "vision_labels": [
        "image",
        "table",
        "chart",
        "flowchart",
        "figure",
    ],
    # Titles attached to tables, charts and figures
    "vision_title_labels": ["table_title", "chart_title", "figure_title"],
    "unordered_labels": [
        "aside_text",
        "seal",
        "number",
        "formula_number",
    ],
    "text_labels": ["text"],
    "header_labels": ["header", "header_image"],
    "footer_labels": ["footer", "footer_image", "footnote"],
    "visualize_index_labels": [
        "text",
        "formula",
        "algorithm",
        "reference",
        "content",
        "abstract",
        "paragraph_title",
        "doc_title",
        "abstract_title",
        # "reference_title" is the spelling used in paragraph_title_labels;
        # without it this list would miss that label while covering every
        # other paragraph-title label.
        "reference_title",
        # Legacy spelling kept so any caller that relied on it still matches.
        "refer_title",
        "content_title",
    ],
}
|