| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014 |
- import os
- import re
- import numpy as np
- from magic_pdf.libs.nlp_utils import NLPModels
- from magic_pdf.para.commons import *
- if sys.version_info[0] >= 3:
- sys.stdout.reconfigure(encoding="utf-8") # type: ignore
- class TitleProcessor:
- def __init__(self, *doc_statistics) -> None:
- if len(doc_statistics) > 0:
- self.doc_statistics = doc_statistics[0]
- self.nlp_model = NLPModels()
- self.MAX_TITLE_LEVEL = 3
- self.numbered_title_pattern = r"""
- ^ # 行首
- ( # 开始捕获组
- [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1)
- |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2)
- |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A)
- |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A)
- |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I)
- |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I)
- |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1
- |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、
- |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一)
- |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1
- |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a)
- |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a)
- |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A-
- |\w+:\s # 英文序号词后跟冒号和空格,例如:First:
- |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
- |[IVXLCDM]+\. # 罗马数字后跟点,例如:I.
- |\d+\.\s # 单个数字后跟点和空格,例如:1.
- ) # 结束捕获组
- .+ # 标题的其余部分
- """
- def _is_potential_title(
- self,
- curr_line,
- prev_line,
- prev_line_is_title,
- next_line,
- avg_char_width,
- avg_char_height,
- median_font_size,
- ):
- """
- This function checks if the line is a potential title.
- Parameters
- ----------
- curr_line : dict
- current line
- prev_line : dict
- previous line
- next_line : dict
- next line
- avg_char_width : float
- average of char widths
- avg_char_height : float
- average of line heights
- Returns
- -------
- bool
- True if the line is a potential title, False otherwise.
- """
- def __is_line_centered(line_bbox, page_bbox, avg_char_width):
- """
- This function checks if the line is centered on the page
- Parameters
- ----------
- line_bbox : list
- bbox of the line
- page_bbox : list
- bbox of the page
- avg_char_width : float
- average of char widths
- Returns
- -------
- bool
- True if the line is centered on the page, False otherwise.
- """
- horizontal_ratio = 0.5
- horizontal_thres = horizontal_ratio * avg_char_width
- x0, _, x1, _ = line_bbox
- _, _, page_x1, _ = page_bbox
- return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
- def __is_bold_font_line(line):
- """
- Check if a line contains any bold font style.
- """
- def _is_bold_span(span):
- # if span text is empty or only contains space, return False
- if not span["text"].strip():
- return False
- return bool(span["flags"] & 2**4) # Check if the font is bold
- for span in line["spans"]:
- if not _is_bold_span(span):
- return False
- return True
- def __is_italic_font_line(line):
- """
- Check if a line contains any italic font style.
- """
- def __is_italic_span(span):
- return bool(span["flags"] & 2**1) # Check if the font is italic
- for span in line["spans"]:
- if not __is_italic_span(span):
- return False
- return True
- def __is_punctuation_heavy(line_text):
- """
- Check if the line contains a high ratio of punctuation marks, which may indicate
- that the line is not a title.
- Parameters:
- line_text (str): Text of the line.
- Returns:
- bool: True if the line is heavy with punctuation, False otherwise.
- """
- # Pattern for common title format like "X.Y. Title"
- pattern = r"\b\d+\.\d+\..*\b"
- # If the line matches the title format, return False
- if re.match(pattern, line_text.strip()):
- return False
- # Find all punctuation marks in the line
- punctuation_marks = re.findall(r"[^\w\s]", line_text)
- number_of_punctuation_marks = len(punctuation_marks)
- text_length = len(line_text)
- if text_length == 0:
- return False
- punctuation_ratio = number_of_punctuation_marks / text_length
- if punctuation_ratio >= 0.1:
- return True
- return False
- def __has_mixed_font_styles(spans, strict_mode=False):
- """
- This function checks if the line has mixed font styles, the strict mode will compare the font types
- Parameters
- ----------
- spans : list
- spans of the line
- strict_mode : bool
- True for strict mode, the font types will be fully compared
- False for non-strict mode, the font types will be compared by the most longest common prefix
- Returns
- -------
- bool
- True if the line has mixed font styles, False otherwise.
- """
- if strict_mode:
- font_styles = set()
- for span in spans:
- font_style = span["font"].lower()
- font_styles.add(font_style)
- return len(font_styles) > 1
- else: # non-strict mode
- font_styles = []
- for span in spans:
- font_style = span["font"].lower()
- font_styles.append(font_style)
- if len(font_styles) > 1:
- longest_common_prefix = os.path.commonprefix(font_styles)
- if len(longest_common_prefix) > 0:
- return False
- else:
- return True
- else:
- return False
- def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
- """
- This function checks if the current line has a different font type from the previous and next lines
- Parameters
- ----------
- curr_line_font_type : str
- font type of the current line
- prev_line_font_type : str
- font type of the previous line
- next_line_font_type : str
- font type of the next line
- Returns
- -------
- bool
- True if the current line has a different font type from the previous and next lines, False otherwise.
- """
- return all(
- curr_line_font_type != other_font_type.lower()
- for other_font_type in [prev_line_font_type, next_line_font_type]
- if other_font_type is not None
- )
- def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
- """
- This function checks if the current line has a larger font size than the previous and next lines
- Parameters
- ----------
- curr_line_font_size : float
- font size of the current line
- prev_line_font_size : float
- font size of the previous line
- next_line_font_size : float
- font size of the next line
- Returns
- -------
- bool
- True if the current line has a larger font size than the previous and next lines, False otherwise.
- """
- return all(
- curr_line_font_size > other_font_size * 1.2
- for other_font_size in [prev_line_font_size, next_line_font_size]
- if other_font_size is not None
- )
- def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
- """
- This function checks if the current line is similar to the previous line
- Parameters
- ----------
- curr_line : dict
- current line
- prev_line : dict
- previous line
- Returns
- -------
- bool
- True if the current line is similar to the previous line, False otherwise.
- """
- if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
- return True
- else:
- return False
- def __is_same_font_type_of_docAvg(curr_line_font_type):
- """
- This function checks if the current line has the same font type as the document average font type
- Parameters
- ----------
- curr_line_font_type : str
- font type of the current line
- Returns
- -------
- bool
- True if the current line has the same font type as the document average font type, False otherwise.
- """
- doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
- doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
- return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
- def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
- """
- This function checks if the current line has a large enough font size
- Parameters
- ----------
- curr_line_font_size : float
- font size of the current line
- ratio : float
- ratio of the current line font size to the document average font size
- Returns
- -------
- bool
- True if the current line has a large enough font size, False otherwise.
- """
- doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
- doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
- doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
- return curr_line_font_size >= doc_avg_font_size * ratio
- def __is_sufficient_spacing_above_and_below(
- curr_line_bbox,
- prev_line_bbox,
- next_line_bbox,
- avg_char_height,
- median_font_size,
- ):
- """
- This function checks if the current line has sufficient spacing above and below
- Parameters
- ----------
- curr_line_bbox : list
- bbox of the current line
- prev_line_bbox : list
- bbox of the previous line
- next_line_bbox : list
- bbox of the next line
- avg_char_width : float
- average of char widths
- avg_char_height : float
- average of line heights
- Returns
- -------
- bool
- True if the current line has sufficient spacing above and below, False otherwise.
- """
- vertical_ratio = 1.25
- vertical_thres = vertical_ratio * median_font_size
- _, y0, _, y1 = curr_line_bbox
- sufficient_spacing_above = False
- if prev_line_bbox:
- vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
- sufficient_spacing_above = vertical_spacing_above > vertical_thres
- else:
- sufficient_spacing_above = True
- sufficient_spacing_below = False
- if next_line_bbox:
- vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
- sufficient_spacing_below = vertical_spacing_below > vertical_thres
- else:
- sufficient_spacing_below = True
- return (sufficient_spacing_above, sufficient_spacing_below)
- def __is_word_list_line_by_rules(curr_line_text):
- """
- This function checks if the current line is a word list
- Parameters
- ----------
- curr_line_text : str
- text of the current line
- Returns
- -------
- bool
- True if the current line is a name list, False otherwise.
- """
- # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)"
- name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[,,;;\s]|$)"
- compiled_pattern = re.compile(name_list_pattern)
- if compiled_pattern.search(curr_line_text):
- return True
- else:
- return False
- # """
- def __get_text_catgr_by_nlp(curr_line_text):
- """
- This function checks if the current line is a name list using nlp model, such as spacy
- Parameters
- ----------
- curr_line_text : str
- text of the current line
- Returns
- -------
- bool
- True if the current line is a name list, False otherwise.
- """
- result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
- return result
- # """
- def __is_numbered_title(curr_line_text):
- """
- This function checks if the current line is a numbered list
- Parameters
- ----------
- curr_line_text : str
- text of the current line
- Returns
- -------
- bool
- True if the current line is a numbered list, False otherwise.
- """
- compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
- if compiled_pattern.search(curr_line_text):
- return True
- else:
- return False
- def __is_end_with_ending_puncs(line_text):
- """
- This function checks if the current line ends with a ending punctuation mark
- Parameters
- ----------
- line_text : str
- text of the current line
- Returns
- -------
- bool
- True if the current line ends with a punctuation mark, False otherwise.
- """
- end_puncs = [".", "?", "!", "。", "?", "!", "…"]
- line_text = line_text.rstrip()
- if line_text[-1] in end_puncs:
- return True
- return False
- def __contains_only_no_meaning_symbols(line_text):
- """
- This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
- Situation contains:
- 1. Only have punctuation marks
- 2. Only have other non-meaning symbols
- Parameters
- ----------
- line_text : str
- text of the current line
- Returns
- -------
- bool
- True if the current line contains only symbols that have no meaning, False otherwise.
- """
- punctuation_marks = re.findall(r"[^\w\s]", line_text) # find all punctuation marks
- number_of_punctuation_marks = len(punctuation_marks)
- text_length = len(line_text)
- if text_length == 0:
- return False
- punctuation_ratio = number_of_punctuation_marks / text_length
- if punctuation_ratio >= 0.9:
- return True
- return False
- def __is_equation(line_text):
- """
- This function checks if the current line is an equation.
- Parameters
- ----------
- line_text : str
- Returns
- -------
- bool
- True if the current line is an equation, False otherwise.
- """
- equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations
- if re.search(equation_reg, line_text):
- return True
- else:
- return False
- def __is_title_by_len(text, max_length=200):
- """
- This function checks if the current line is a title by length.
- Parameters
- ----------
- text : str
- text of the current line
- max_length : int
- max length of the title
- Returns
- -------
- bool
- True if the current line is a title, False otherwise.
- """
- text = text.strip()
- return len(text) <= max_length
- def __compute_line_font_type_and_size(curr_line):
- """
- This function computes the font type and font size of the line.
- Parameters
- ----------
- line : dict
- line
- Returns
- -------
- font_type : str
- font type of the line
- font_size : float
- font size of the line
- """
- spans = curr_line["spans"]
- max_accumulated_length = 0
- max_span_font_size = curr_line["spans"][0]["size"] # default value, float type
- max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type
- for span in spans:
- if span["text"].isspace():
- continue
- span_length = span["bbox"][2] - span["bbox"][0]
- if span_length > max_accumulated_length:
- max_accumulated_length = span_length
- max_span_font_size = span["size"]
- max_span_font_type = span["font"].lower()
- return max_span_font_type, max_span_font_size
- """
- Title detecting main Process.
- """
- """
- Basic features about the current line.
- """
- curr_line_bbox = curr_line["bbox"]
- curr_line_text = curr_line["text"]
- curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
- if len(curr_line_text.strip()) == 0: # skip empty lines
- return False
- prev_line_bbox = prev_line["bbox"] if prev_line else None
- if prev_line:
- prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
- else:
- prev_line_font_type, prev_line_font_size = None, None
- next_line_bbox = next_line["bbox"] if next_line else None
- if next_line:
- next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
- else:
- next_line_font_type, next_line_font_size = None, None
- """
- Aggregated features about the current line.
- """
- is_italc_font = __is_italic_font_line(curr_line)
- is_bold_font = __is_bold_font_line(curr_line)
- is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
- is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
- is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
- is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
- is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
- is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
- is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
- is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
- is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
- is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
- is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
- curr_line_font_size, prev_line_font_size, next_line_font_size
- )
- is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
- curr_line_font_type, prev_line_font_type, next_line_font_type
- )
- has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
- curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
- )
- is_similar_to_pre_line = __is_similar_to_pre_line(
- curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
- )
- """
- Further aggregated features about the current line.
-
- Attention:
- Features that start with __ are for internal use.
- """
- __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
- curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
- )
- __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
- is_a_left_inline_title = (
- is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
- )
- is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
- is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
- is_title_by_check_pre_and_next_line = (
- (prev_line is not None or next_line is not None)
- and has_sufficient_spaces_above
- and has_sufficient_spaces_below
- and is_potential_title_font
- )
- is_numbered_title = __is_numbered_title(curr_line_text) and (
- (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
- )
- is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
- is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
- is_equation = __is_equation(curr_line_text)
- is_title_by_len = __is_title_by_len(curr_line_text)
- """
- Decide if the line is a title.
- """
- # is_title = False
- # if prev_line_is_title:
- is_title = (
- is_not_end_with_ending_puncs # not end with ending punctuation marks
- and is_not_only_no_meaning_symbols # not only have no meaning symbols
- and is_title_by_len # is a title by length, default max length is 200
- and not is_equation # an interline equation should never be a title
- and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
- and (
- (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
- or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
- or (
- is_much_larger_font_than_doc_avg
- and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
- )
- or (
- is_font_size_little_less_than_doc_avg
- and is_bold_font
- and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
- )
- ) # not the same font type as the document average font type, which includes the most common font type and the second most common font type
- and (
- (
- not is_person_or_org_list_line_by_nlp
- and (
- is_much_larger_font_than_doc_avg
- or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
- )
- )
- or (
- not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
- and not is_a_left_inline_title
- and not is_punctuation_heavy
- and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
- )
- or (
- is_person_or_org_list_line_by_nlp
- and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
- and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
- )
- or (is_numbered_title and not is_a_left_inline_title)
- )
- )
- # ) or (is_similar_to_pre_line and prev_line_is_title)
- is_name_or_org_list_to_be_removed = (
- (is_person_or_org_list_line_by_nlp)
- and is_punctuation_heavy
- and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
- ) and not is_title
- if is_name_or_org_list_to_be_removed:
- is_author_or_org_list = True
- # print curr_line_text to check
- # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
- else:
- is_author_or_org_list = False
- """
- # print reason why the line is a title
- if is_title:
- print_green("This line is a title.")
- print_green("↓" * 10)
- print()
- print("curr_line_text: ", curr_line_text)
- print()
- # print reason why the line is not a title
- line_text = curr_line_text.strip()
- test_text = "Career/Personal Life"
- text_content_condition = line_text == test_text
-
- if not is_title and text_content_condition: # Print specific line
- # if not is_title: # Print each line
- print_red("This line is not a title.")
- print_red("↓" * 10)
- print()
- print("curr_line_text: ", curr_line_text)
- print()
- if is_not_end_with_ending_puncs:
- print_green(f"is_not_end_with_ending_puncs")
- else:
- print_red(f"is_end_with_ending_puncs")
- if is_not_only_no_meaning_symbols:
- print_green(f"is_not_only_no_meaning_symbols")
- else:
- print_red(f"is_only_no_meaning_symbols")
- if is_title_by_len:
- print_green(f"is_title_by_len: {is_title_by_len}")
- else:
- print_red(f"is_not_title_by_len: {is_title_by_len}")
- if is_equation:
- print_red(f"is_equation")
- else:
- print_green(f"is_not_equation")
- if is_potential_title_font:
- print_green(f"is_potential_title_font")
- else:
- print_red(f"is_not_potential_title_font")
- if is_punctuation_heavy:
- print_red("is_punctuation_heavy")
- else:
- print_green("is_not_punctuation_heavy")
- if is_bold_font:
- print_green(f"is_bold_font")
- else:
- print_red(f"is_not_bold_font")
- if is_font_size_not_less_than_doc_avg:
- print_green(f"is_larger_font_than_doc_avg")
- else:
- print_red(f"is_not_larger_font_than_doc_avg")
- if is_much_larger_font_than_doc_avg:
- print_green(f"is_much_larger_font_than_doc_avg")
- else:
- print_red(f"is_not_much_larger_font_than_doc_avg")
- if is_not_same_font_type_of_docAvg:
- print_green(f"is_not_same_font_type_of_docAvg")
- else:
- print_red(f"is_same_font_type_of_docAvg")
- if is_word_list_line_by_rules:
- print_red("is_word_list_line_by_rules")
- else:
- print_green("is_not_name_list_by_rules")
- if is_person_or_org_list_line_by_nlp:
- print_red("is_person_or_org_list_line_by_nlp")
- else:
- print_green("is_not_person_or_org_list_line_by_nlp")
- if not is_numbered_title:
- print_red("is_not_numbered_title")
- else:
- print_green("is_numbered_title")
- if is_a_left_inline_title:
- print_red("is_a_left_inline_title")
- else:
- print_green("is_not_a_left_inline_title")
- if not is_title_by_check_prev_line:
- print_red("is_not_title_by_check_prev_line")
- else:
- print_green("is_title_by_check_prev_line")
- if not is_title_by_check_next_line:
- print_red("is_not_title_by_check_next_line")
- else:
- print_green("is_title_by_check_next_line")
- if not is_title_by_check_pre_and_next_line:
- print_red("is_not_title_by_check_pre_and_next_line")
- else:
- print_green("is_title_by_check_pre_and_next_line")
- # print_green("Common features:")
- # print_green("↓" * 10)
- # print(f" curr_line_font_type: {curr_line_font_type}")
- # print(f" curr_line_font_size: {curr_line_font_size}")
- # print()
- """
- return is_title, is_author_or_org_list
- def _detect_block_title(self, input_block):
- """
- Use the functions 'is_potential_title' to detect titles of each paragraph block.
- If a line is a title, then the value of key 'is_title' of the line will be set to True.
- """
- raw_lines = input_block["lines"]
- prev_line_is_title_flag = False
- for i, curr_line in enumerate(raw_lines):
- prev_line = raw_lines[i - 1] if i > 0 else None
- next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
- blk_avg_char_width = input_block["avg_char_width"]
- blk_avg_char_height = input_block["avg_char_height"]
- blk_media_font_size = input_block["median_font_size"]
- is_title, is_author_or_org_list = self._is_potential_title(
- curr_line,
- prev_line,
- prev_line_is_title_flag,
- next_line,
- blk_avg_char_width,
- blk_avg_char_height,
- blk_media_font_size,
- )
- if is_title:
- curr_line["is_title"] = is_title
- prev_line_is_title_flag = True
- else:
- curr_line["is_title"] = False
- prev_line_is_title_flag = False
- if is_author_or_org_list:
- curr_line["is_author_or_org_list"] = is_author_or_org_list
- else:
- curr_line["is_author_or_org_list"] = False
- return input_block
- def batch_process_blocks_detect_titles(self, pdf_dic):
- """
- This function batch process the blocks to detect titles.
- Parameters
- ----------
- pdf_dict : dict
- result dictionary
- Returns
- -------
- pdf_dict : dict
- result dictionary
- """
- num_titles = 0
- for page_id, blocks in pdf_dic.items():
- if page_id.startswith("page_"):
- para_blocks = []
- if "para_blocks" in blocks.keys():
- para_blocks = blocks["para_blocks"]
- all_single_line_blocks = []
- for block in para_blocks:
- if len(block["lines"]) == 1:
- all_single_line_blocks.append(block)
- new_para_blocks = []
- if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks.
- for para_block in para_blocks:
- new_block = self._detect_block_title(para_block)
- new_para_blocks.append(new_block)
- num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
- else: # All blocks are single line blocks.
- for para_block in para_blocks:
- new_para_blocks.append(para_block)
- num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
- para_blocks = new_para_blocks
- blocks["para_blocks"] = para_blocks
- for para_block in para_blocks:
- all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
- para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
- if (
- all_titles and para_text_len < 200
- ): # total length of the paragraph is less than 200, more than this should not be a title
- para_block["is_block_title"] = 1
- else:
- para_block["is_block_title"] = 0
- all_name_or_org_list_to_be_removed = all(
- safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
- )
- if all_name_or_org_list_to_be_removed and page_id == "page_0":
- para_block["is_block_an_author_or_org_list"] = 1
- else:
- para_block["is_block_an_author_or_org_list"] = 0
- pdf_dic["statistics"]["num_titles"] = num_titles
- return pdf_dic
- def __determine_size_based_level(self, title_blocks):
- """
- This function determines the title level based on the font size of the title.
- Parameters
- ----------
- title_blocks : list
- Returns
- -------
- title_blocks : list
- """
- font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
- # Use the mean and std of font sizes to remove extreme values
- mean_font_size = np.mean(font_sizes)
- std_font_size = np.std(font_sizes)
- min_extreme_font_size = mean_font_size - std_font_size # type: ignore
- max_extreme_font_size = mean_font_size + std_font_size # type: ignore
- # Compute the threshold for title level
- middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
- if middle_font_sizes.size > 0:
- middle_mean_font_size = np.mean(middle_font_sizes)
- level_threshold = middle_mean_font_size
- else:
- level_threshold = mean_font_size
- for tb in title_blocks:
- title_block = tb["block"]
- title_font_size = safe_get(title_block, "block_font_size", 0)
- current_level = 1 # Initialize title level, the biggest level is 1
- # print(f"Before adjustment by font size, {current_level}")
- if title_font_size >= max_extreme_font_size:
- current_level = 1
- elif title_font_size <= min_extreme_font_size:
- current_level = 3
- elif float(title_font_size) >= float(level_threshold):
- current_level = 2
- else:
- current_level = 3
- # print(f"After adjustment by font size, {current_level}")
- title_block["block_title_level"] = current_level
- return title_blocks
- def batch_process_blocks_recog_title_level(self, pdf_dic):
- title_blocks = []
- # Collect all titles
- for page_id, blocks in pdf_dic.items():
- if page_id.startswith("page_"):
- para_blocks = blocks.get("para_blocks", [])
- for block in para_blocks:
- if block.get("is_block_title"):
- title_obj = {"page_id": page_id, "block": block}
- title_blocks.append(title_obj)
- # Determine title level
- if title_blocks:
- # Determine title level based on font size
- title_blocks = self.__determine_size_based_level(title_blocks)
- return pdf_dic
|