setting.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. XYCUT_SETTINGS = {
  15. "child_block_overlap_ratio_threshold": 0.1,
  16. "edge_distance_compare_tolerance_len": 2,
  17. "distance_weight_map": {
  18. "edge_weight": 10**4,
  19. "up_edge_weight": 1,
  20. "down_edge_weight": 0.0001,
  21. },
  22. }
  23. REGION_SETTINGS = {
  24. "match_block_overlap_ratio_threshold": 0.6,
  25. "split_block_overlap_ratio_threshold": 0.4,
  26. }
  27. BLOCK_SETTINGS = {
  28. "title_conversion_area_ratio_threshold": 0.3, # update paragraph_title -> doc_title
  29. }
  30. LINE_SETTINGS = {
  31. "line_height_iou_threshold": 0.6, # For line segmentation of OCR results
  32. "delimiter_map": {
  33. "doc_title": " ",
  34. "content": "\n",
  35. },
  36. }
  37. BLOCK_LABEL_MAP = {
  38. "doc_title_labels": ["doc_title"], # 文档标题
  39. "paragraph_title_labels": [
  40. "paragraph_title",
  41. "abstract_title",
  42. "reference_title",
  43. "content_title",
  44. ], # 段落标题
  45. "vision_labels": [
  46. "image",
  47. "table",
  48. "chart",
  49. "flowchart",
  50. "figure",
  51. ], # 图、表、印章、图表、图
  52. "vision_title_labels": ["table_title", "chart_title", "figure_title"], # 图表标题
  53. "unordered_labels": [
  54. "aside_text",
  55. "seal",
  56. "number",
  57. "formula_number",
  58. ],
  59. "text_labels": ["text"],
  60. "header_labels": ["header", "header_image"],
  61. "footer_labels": ["footer", "footer_image", "footnote"],
  62. "visualize_index_labels": [
  63. "text",
  64. "formula",
  65. "algorithm",
  66. "reference",
  67. "content",
  68. "abstract",
  69. "paragraph_title",
  70. "doc_title",
  71. "abstract_title",
  72. "refer_title",
  73. "content_title",
  74. ],
  75. }