setting.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. XYCUT_SETTINGS = {
  15. "child_block_overlap_ratio_threshold": 0.1,
  16. "edge_distance_compare_tolerance_len": 2,
  17. "distance_weight_map": {
  18. "edge_weight": 10**4,
  19. "up_edge_weight": 1,
  20. "down_edge_weight": 0.0001,
  21. },
  22. "cross_layout_ref_text_block_words_num_threshold": 10,
  23. }
  24. REGION_SETTINGS = {
  25. "match_block_overlap_ratio_threshold": 0.6,
  26. "split_block_overlap_ratio_threshold": 0.4,
  27. }
  28. BLOCK_SETTINGS = {
  29. "title_conversion_area_ratio_threshold": 0.3, # update paragraph_title -> doc_title
  30. }
  31. LINE_SETTINGS = {
  32. "line_height_iou_threshold": 0.6, # For line segmentation of OCR results
  33. "delimiter_map": {
  34. "doc_title": " ",
  35. "content": "\n",
  36. },
  37. }
  38. BLOCK_LABEL_MAP = {
  39. "doc_title_labels": ["doc_title"], # 文档标题
  40. "paragraph_title_labels": [
  41. "paragraph_title",
  42. "abstract_title",
  43. "reference_title",
  44. "content_title",
  45. ], # 段落标题
  46. "vision_labels": [
  47. "image",
  48. "table",
  49. "chart",
  50. "flowchart",
  51. "figure",
  52. ], # 图、表、印章、图表、图
  53. "vision_title_labels": [
  54. "table_title",
  55. "chart_title",
  56. "figure_title",
  57. "figure_table_chart_title",
  58. ], # 图表标题
  59. "unordered_labels": [
  60. "aside_text",
  61. "seal",
  62. "number",
  63. "formula_number",
  64. ],
  65. "text_labels": ["text"],
  66. "header_labels": ["header", "header_image"],
  67. "footer_labels": ["footer", "footer_image", "footnote"],
  68. "visualize_index_labels": [
  69. "text",
  70. "formula",
  71. "algorithm",
  72. "reference",
  73. "content",
  74. "abstract",
  75. "paragraph_title",
  76. "doc_title",
  77. "abstract_title",
  78. "refer_title",
  79. "content_title",
  80. ],
  81. "image_labels": ["image", "figure"],
  82. }