# zhengchun/MinerU
							import os
import re
import numpy as np

from magic_pdf.libs.nlp_utils import NLPModels

from magic_pdf.para.commons import *

# On Python 3+, switch stdout to UTF-8 at import time; presumably so the
# non-ASCII (e.g. Chinese) text this module handles can be printed on consoles
# whose default encoding is not UTF-8.
# NOTE(review): `sys` is not imported explicitly in the visible imports; it
# presumably arrives through `from magic_pdf.para.commons import *` -- confirm.
# NOTE(review): this is a module-level side effect and relies on sys.stdout
# exposing `reconfigure` (hence the type-ignore) -- confirm for redirected stdout.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


class TitleProcessor:
    def __init__(self, *doc_statistics) -> None:
        """
        Set up the title detector.

        Parameters
        ----------
        *doc_statistics
            optional; only the first positional argument is kept and stored as
            ``self.doc_statistics``. Extra arguments are ignored.
        """
        # Always define the attribute: it used to be assigned only when an
        # argument was given, so any later read of self.doc_statistics raised
        # AttributeError for a processor built without statistics.
        # NOTE(review): the empty-dict default assumes `safe_get` accepts any
        # dict-like object -- confirm against magic_pdf.para.commons.
        self.doc_statistics = doc_statistics[0] if doc_statistics else {}

        self.nlp_model = NLPModels()
        self.MAX_TITLE_LEVEL = 3

        # Pattern for lines that start with a numbering prefix. It is written for
        # re.VERBOSE: whitespace and the inline `#` comments (in Chinese) inside
        # the literal are ignored by the regex engine. The alternatives cover
        # bracketed / right-bracketed digits, letters and Roman numerals, dotted
        # numbers such as "3.2.1", Chinese ordinals, "A.1"-style labels,
        # "Word: " prefixes and "第...章"-style headings.
        # The bracketed-Chinese-ordinal alternative previously had a literal `|`
        # inside both character classes, so "|" was accepted as a bracket.
        self.numbered_title_pattern = r"""
            ^                                 # 行首
            (                                 # 开始捕获组
                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
                |[\（\(][一二三四五六七八九十百千]+[\）\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
            )                                 # 结束捕获组
            .+                                # 标题的其余部分
        """

    def _is_potential_title(
        self,
        curr_line,
        prev_line,
        prev_line_is_title,
        next_line,
        avg_char_width,
        avg_char_height,
        median_font_size,
    ):
        """
        This function checks if the line is a potential title.

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict
            previous line
        next_line : dict
            next line
        avg_char_width : float
            average of char widths
        avg_char_height : float
            average of line heights

        Returns
        -------
        bool
            True if the line is a potential title, False otherwise.
        """

        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
            """
            This function checks if the line is centered on the page

            Parameters
            ----------
            line_bbox : list
                bbox of the line
            page_bbox : list
                bbox of the page
            avg_char_width : float
                average of char widths

            Returns
            -------
            bool
                True if the line is centered on the page, False otherwise.
            """
            horizontal_ratio = 0.5
            horizontal_thres = horizontal_ratio * avg_char_width

            x0, _, x1, _ = line_bbox
            _, _, page_x1, _ = page_bbox

            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres

        def __is_bold_font_line(line):
            """
            Check if a line contains any bold font style.
            """

            def _is_bold_span(span):
                # if span text is empty or only contains space, return False
                if not span["text"].strip():
                    return False

                return bool(span["flags"] & 2**4)  # Check if the font is bold

            for span in line["spans"]:
                if not _is_bold_span(span):
                    return False

            return True

        def __is_italic_font_line(line):
            """
            Check if a line contains any italic font style.
            """

            def __is_italic_span(span):
                return bool(span["flags"] & 2**1)  # Check if the font is italic

            for span in line["spans"]:
                if not __is_italic_span(span):
                    return False

            return True

        def __is_punctuation_heavy(line_text):
            """
            Check if the line contains a high ratio of punctuation marks, which may indicate
            that the line is not a title.

            Parameters:
            line_text (str): Text of the line.

            Returns:
            bool: True if the line is heavy with punctuation, False otherwise.
            """
            # Pattern for common title format like "X.Y. Title"
            pattern = r"\b\d+\.\d+\..*\b"

            # If the line matches the title format, return False
            if re.match(pattern, line_text.strip()):
                return False

            # Find all punctuation marks in the line
            punctuation_marks = re.findall(r"[^\w\s]", line_text)
            number_of_punctuation_marks = len(punctuation_marks)

            text_length = len(line_text)

            if text_length == 0:
                return False

            punctuation_ratio = number_of_punctuation_marks / text_length
            if punctuation_ratio >= 0.1:
                return True

            return False

        def __has_mixed_font_styles(spans, strict_mode=False):
            """
            This function checks if the line has mixed font styles, the strict mode will compare the font types

            Parameters
            ----------
            spans : list
                spans of the line
            strict_mode : bool
                True for strict mode, the font types will be fully compared
                False for non-strict mode, the font types will be compared by the most longest common prefix

            Returns
            -------
            bool
                True if the line has mixed font styles, False otherwise.
            """
            if strict_mode:
                font_styles = set()
                for span in spans:
                    font_style = span["font"].lower()
                    font_styles.add(font_style)

                return len(font_styles) > 1

            else:  # non-strict mode
                font_styles = []
                for span in spans:
                    font_style = span["font"].lower()
                    font_styles.append(font_style)

                if len(font_styles) > 1:
                    longest_common_prefix = os.path.commonprefix(font_styles)
                    if len(longest_common_prefix) > 0:
                        return False
                    else:
                        return True
                else:
                    return False

        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
            """
            This function checks if the current line has a different font type from the previous and next lines

            Parameters
            ----------
            curr_line_font_type : str
                font type of the current line
            prev_line_font_type : str
                font type of the previous line
            next_line_font_type : str
                font type of the next line

            Returns
            -------
            bool
                True if the current line has a different font type from the previous and next lines, False otherwise.
            """
            return all(
                curr_line_font_type != other_font_type.lower()
                for other_font_type in [prev_line_font_type, next_line_font_type]
                if other_font_type is not None
            )

        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
            """
            This function checks if the current line has a larger font size than the previous and next lines

            Parameters
            ----------
            curr_line_font_size : float
                font size of the current line
            prev_line_font_size : float
                font size of the previous line
            next_line_font_size : float
                font size of the next line

            Returns
            -------
            bool
                True if the current line has a larger font size than the previous and next lines, False otherwise.
            """
            return all(
                curr_line_font_size > other_font_size * 1.2
                for other_font_size in [prev_line_font_size, next_line_font_size]
                if other_font_size is not None
            )

        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
            """
            This function checks if the current line is similar to the previous line

            Parameters
            ----------
            curr_line : dict
                current line
            prev_line : dict
                previous line

            Returns
            -------
            bool
                True if the current line is similar to the previous line, False otherwise.
            """

            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
                return True
            else:
                return False

        def __is_same_font_type_of_docAvg(curr_line_font_type):
            """
            This function checks if the current line has the same font type as the document average font type

            Parameters
            ----------
            curr_line_font_type : str
                font type of the current line

            Returns
            -------
            bool
                True if the current line has the same font type as the document average font type, False otherwise.
            """
            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()

            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]

        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
            """
            This function checks if the current line has a large enough font size

            Parameters
            ----------
            curr_line_font_size : float
                font size of the current line
            ratio : float
                ratio of the current line font size to the document average font size

            Returns
            -------
            bool
                True if the current line has a large enough font size, False otherwise.
            """
            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)

            return curr_line_font_size >= doc_avg_font_size * ratio

        def __is_sufficient_spacing_above_and_below(
            curr_line_bbox,
            prev_line_bbox,
            next_line_bbox,
            avg_char_height,
            median_font_size,
        ):
            """
            This function checks if the current line has sufficient spacing above and below

            Parameters
            ----------
            curr_line_bbox : list
                bbox of the current line
            prev_line_bbox : list
                bbox of the previous line
            next_line_bbox : list
                bbox of the next line
            avg_char_width : float
                average of char widths
            avg_char_height : float
                average of line heights

            Returns
            -------
            bool
                True if the current line has sufficient spacing above and below, False otherwise.
            """
            vertical_ratio = 1.25
            vertical_thres = vertical_ratio * median_font_size

            _, y0, _, y1 = curr_line_bbox

            sufficient_spacing_above = False
            if prev_line_bbox:
                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
                sufficient_spacing_above = vertical_spacing_above > vertical_thres
            else:
                sufficient_spacing_above = True

            sufficient_spacing_below = False
            if next_line_bbox:
                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
                sufficient_spacing_below = vertical_spacing_below > vertical_thres
            else:
                sufficient_spacing_below = True

            return (sufficient_spacing_above, sufficient_spacing_below)

        def __is_word_list_line_by_rules(curr_line_text):
            """
            This function checks if the current line is a word list

            Parameters
            ----------
            curr_line_text : str
                text of the current line

            Returns
            -------
            bool
                True if the current line is a name list, False otherwise.
            """
            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"

            compiled_pattern = re.compile(name_list_pattern)

            if compiled_pattern.search(curr_line_text):
                return True
            else:
                return False

        # """
        def __get_text_catgr_by_nlp(curr_line_text):
            """
            This function checks if the current line is a name list using nlp model, such as spacy

            Parameters
            ----------
            curr_line_text : str
                text of the current line

            Returns
            -------
            bool
                True if the current line is a name list, False otherwise.
            """

            result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)

            return result

        # """

        def __is_numbered_title(curr_line_text):
            """
            This function checks if the current line is a numbered list

            Parameters
            ----------
            curr_line_text : str
                text of the current line

            Returns
            -------
            bool
                True if the current line is a numbered list, False otherwise.
            """

            compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)

            if compiled_pattern.search(curr_line_text):
                return True
            else:
                return False

        def __is_end_with_ending_puncs(line_text):
            """
            This function checks if the current line ends with a ending punctuation mark

            Parameters
            ----------
            line_text : str
                text of the current line

            Returns
            -------
            bool
                True if the current line ends with a punctuation mark, False otherwise.
            """
            end_puncs = [".", "?", "!", "。", "？", "！", "…"]

            line_text = line_text.rstrip()
            if line_text[-1] in end_puncs:
                return True

            return False

        def __contains_only_no_meaning_symbols(line_text):
            """
            This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
            Situation contains:
            1. Only have punctuation marks
            2. Only have other non-meaning symbols

            Parameters
            ----------
            line_text : str
                text of the current line

            Returns
            -------
            bool
                True if the current line contains only symbols that have no meaning, False otherwise.
            """

            punctuation_marks = re.findall(r"[^\w\s]", line_text)  # find all punctuation marks
            number_of_punctuation_marks = len(punctuation_marks)

            text_length = len(line_text)

            if text_length == 0:
                return False

            punctuation_ratio = number_of_punctuation_marks / text_length
            if punctuation_ratio >= 0.9:
                return True

            return False

        def __is_equation(line_text):
            """
            This function checks if the current line is an equation.

            Parameters
            ----------
            line_text : str

            Returns
            -------
            bool
                True if the current line is an equation, False otherwise.
            """
            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations

            if re.search(equation_reg, line_text):
                return True
            else:
                return False

        def __is_title_by_len(text, max_length=200):
            """
            This function checks if the current line is a title by length.

            Parameters
            ----------
            text : str
                text of the current line

            max_length : int
                max length of the title

            Returns
            -------
            bool
                True if the current line is a title, False otherwise.

            """
            text = text.strip()
            return len(text) <= max_length

        def __compute_line_font_type_and_size(curr_line):
            """
            This function computes the font type and font size of the line.

            Parameters
            ----------
            line : dict
                line

            Returns
            -------
            font_type : str
                font type of the line
            font_size : float
                font size of the line
            """
            spans = curr_line["spans"]
            max_accumulated_length = 0
            max_span_font_size = curr_line["spans"][0]["size"]  # default value, float type
            max_span_font_type = curr_line["spans"][0]["font"].lower()  # default value, string type
            for span in spans:
                if span["text"].isspace():
                    continue
                span_length = span["bbox"][2] - span["bbox"][0]
                if span_length > max_accumulated_length:
                    max_accumulated_length = span_length
                    max_span_font_size = span["size"]
                    max_span_font_type = span["font"].lower()

            return max_span_font_type, max_span_font_size

        """
        Title detecting main Process.
        """

        """
        Basic features about the current line.
        """
        curr_line_bbox = curr_line["bbox"]
        curr_line_text = curr_line["text"]
        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)

        if len(curr_line_text.strip()) == 0:  # skip empty lines
            return False

        prev_line_bbox = prev_line["bbox"] if prev_line else None
        if prev_line:
            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
        else:
            prev_line_font_type, prev_line_font_size = None, None

        next_line_bbox = next_line["bbox"] if next_line else None
        if next_line:
            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
        else:
            next_line_font_type, next_line_font_size = None, None

        """
        Aggregated features about the current line.
        """
        is_italc_font = __is_italic_font_line(curr_line)
        is_bold_font = __is_bold_font_line(curr_line)

        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)

        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)

        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg

        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)

        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)

        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]

        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
            curr_line_font_size, prev_line_font_size, next_line_font_size
        )

        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
            curr_line_font_type, prev_line_font_type, next_line_font_type
        )

        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
        )

        is_similar_to_pre_line = __is_similar_to_pre_line(
            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
        )

        """
        Further aggregated features about the current line.
        
        Attention:
            Features that start with __ are for internal use.
        """

        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
        )
        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
        is_a_left_inline_title = (
            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
        )

        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font

        is_title_by_check_pre_and_next_line = (
            (prev_line is not None or next_line is not None)
            and has_sufficient_spaces_above
            and has_sufficient_spaces_below
            and is_potential_title_font
        )

        is_numbered_title = __is_numbered_title(curr_line_text) and (
            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
        )

        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)

        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)

        is_equation = __is_equation(curr_line_text)

        is_title_by_len = __is_title_by_len(curr_line_text)

        """
        Decide if the line is a title.
        """
        # is_title = False
        # if prev_line_is_title:

        is_title = (
            is_not_end_with_ending_puncs  # not end with ending punctuation marks
            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
            and is_title_by_len  # is a title by length, default max length is 200
            and not is_equation  # an interline equation should never be a title
            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
            and (
                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
                or (
                    is_much_larger_font_than_doc_avg
                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
                )
                or (
                    is_font_size_little_less_than_doc_avg
                    and is_bold_font
                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
                )
            )  # not the same font type as the document average font type, which includes the most common font type and the second most common font type
            and (
                (
                    not is_person_or_org_list_line_by_nlp
                    and (
                        is_much_larger_font_than_doc_avg
                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
                    )
                )
                or (
                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
                    and not is_a_left_inline_title
                    and not is_punctuation_heavy
                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
                )
                or (
                    is_person_or_org_list_line_by_nlp
                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
                )
                or (is_numbered_title and not is_a_left_inline_title)
            )
        )
        # ) or (is_similar_to_pre_line and prev_line_is_title)

        is_name_or_org_list_to_be_removed = (
            (is_person_or_org_list_line_by_nlp)
            and is_punctuation_heavy
            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
        ) and not is_title

        if is_name_or_org_list_to_be_removed:
            is_author_or_org_list = True
            # print curr_line_text to check
            # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}")
        else:
            is_author_or_org_list = False
        """
        # print reason why the line is a title
        if is_title:
            print_green("This line is a title.")
            print_green("↓" * 10)
            print()
            print("curr_line_text: ", curr_line_text)
            print()

        # print reason why the line is not a title
        line_text = curr_line_text.strip()
        test_text = "Career/Personal Life"
        text_content_condition = line_text == test_text
        
        if not is_title and text_content_condition: # Print specific line
        # if not is_title: # Print each line
            print_red("This line is not a title.")
            print_red("↓" * 10)

            print()
            print("curr_line_text: ", curr_line_text)
            print()

            if is_not_end_with_ending_puncs:
                print_green(f"is_not_end_with_ending_puncs")
            else:
                print_red(f"is_end_with_ending_puncs")

            if is_not_only_no_meaning_symbols:
                print_green(f"is_not_only_no_meaning_symbols")
            else:
                print_red(f"is_only_no_meaning_symbols")

            if is_title_by_len:
                print_green(f"is_title_by_len: {is_title_by_len}")
            else:
                print_red(f"is_not_title_by_len: {is_title_by_len}")

            if is_equation:
                print_red(f"is_equation")
            else:
                print_green(f"is_not_equation")

            if is_potential_title_font:
                print_green(f"is_potential_title_font")
            else:
                print_red(f"is_not_potential_title_font")

            if is_punctuation_heavy:
                print_red("is_punctuation_heavy")
            else:
                print_green("is_not_punctuation_heavy")

            if is_bold_font:
                print_green(f"is_bold_font")
            else:
                print_red(f"is_not_bold_font")

            if is_font_size_not_less_than_doc_avg:
                print_green(f"is_larger_font_than_doc_avg")
            else:
                print_red(f"is_not_larger_font_than_doc_avg")

            if is_much_larger_font_than_doc_avg:
                print_green(f"is_much_larger_font_than_doc_avg")
            else:
                print_red(f"is_not_much_larger_font_than_doc_avg")

            if is_not_same_font_type_of_docAvg:
                print_green(f"is_not_same_font_type_of_docAvg")
            else:
                print_red(f"is_same_font_type_of_docAvg")

            if is_word_list_line_by_rules:
                print_red("is_word_list_line_by_rules")
            else:
                print_green("is_not_name_list_by_rules")

            if is_person_or_org_list_line_by_nlp:
                print_red("is_person_or_org_list_line_by_nlp")
            else:
                print_green("is_not_person_or_org_list_line_by_nlp")

            if not is_numbered_title:
                print_red("is_not_numbered_title")
            else:
                print_green("is_numbered_title")

            if is_a_left_inline_title:
                print_red("is_a_left_inline_title")
            else:
                print_green("is_not_a_left_inline_title")

            if not is_title_by_check_prev_line:
                print_red("is_not_title_by_check_prev_line")
            else:
                print_green("is_title_by_check_prev_line")

            if not is_title_by_check_next_line:
                print_red("is_not_title_by_check_next_line")
            else:
                print_green("is_title_by_check_next_line")

            if not is_title_by_check_pre_and_next_line:
                print_red("is_not_title_by_check_pre_and_next_line")
            else:
                print_green("is_title_by_check_pre_and_next_line")

        # print_green("Common features:")
        # print_green("↓" * 10)

        # print(f"    curr_line_font_type: {curr_line_font_type}")
        # print(f"    curr_line_font_size: {curr_line_font_size}")
        # print()

        """

        return is_title, is_author_or_org_list

    def _detect_block_title(self, input_block):
        """
        Run '_is_potential_title' on every line of a paragraph block and store the verdicts on the lines.

        Each line of input_block["lines"] receives two keys:
        'is_title' and 'is_author_or_org_list'. A falsy verdict is always stored as False,
        a truthy verdict is stored as returned by '_is_potential_title'.

        Parameters
        ----------
        input_block : dict
            paragraph block; read keys are "lines", "avg_char_width", "avg_char_height" and "median_font_size"

        Returns
        -------
        input_block : dict
            the same block object, with its lines updated in place
        """

        lines = input_block["lines"]

        # Whether the line handled in the previous iteration was classified as a title.
        previous_was_title = False

        for idx, line in enumerate(lines):
            # Neighbouring lines inside this block; None at the block boundaries.
            line_before = lines[idx - 1] if idx > 0 else None
            line_after = lines[idx + 1] if idx + 1 < len(lines) else None

            title_verdict, author_verdict = self._is_potential_title(
                line,
                line_before,
                previous_was_title,
                line_after,
                input_block["avg_char_width"],
                input_block["avg_char_height"],
                input_block["median_font_size"],
            )

            line["is_title"] = title_verdict if title_verdict else False
            previous_was_title = bool(title_verdict)

            line["is_author_or_org_list"] = author_verdict if author_verdict else False

        return input_block

    def batch_process_blocks_detect_titles(self, pdf_dic):
        """
        Detect title lines on every page and derive the block-level title / author-list flags.

        For each entry of `pdf_dic` whose key starts with "page_":

        * Unless every paragraph block of the page has exactly one line, each block is passed
          through `_detect_block_title`. Pages made only of single-line blocks are not
          re-detected; the 'is_title' values already present on their lines are used.
        * The page's "para_blocks" entry is (re)assigned; a page without that key gets an empty list.
        * Each block gets "is_block_title" = 1 when it has at least one line, all its lines are
          titles and its total text length is below 200 characters, otherwise 0.
        * Each block gets "is_block_an_author_or_org_list" = 1 when it has at least one line,
          all its lines are author/organisation list lines and the page is "page_0", otherwise 0.

        The number of title lines over all pages is written to pdf_dic["statistics"]["num_titles"].

        Parameters
        ----------
        pdf_dic : dict
            result dictionary

        Returns
        -------
        pdf_dic : dict
            the same result dictionary, updated in place
        """
        num_titles = 0

        for page_id, blocks in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue

            para_blocks = blocks.get("para_blocks", [])

            if any(len(block["lines"]) != 1 for block in para_blocks):
                # Not all blocks are single line blocks.
                para_blocks = [self._detect_block_title(block) for block in para_blocks]
            else:
                # All blocks are single line blocks: keep them as they are.
                para_blocks = list(para_blocks)

            num_titles += sum(line.get("is_title", 0) for block in para_blocks for line in block["lines"])

            blocks["para_blocks"] = para_blocks

            for para_block in para_blocks:
                lines = para_block["lines"]

                # all() is True for an empty sequence, so a block without lines must be
                # excluded explicitly; otherwise it would be flagged as a title.
                has_lines = len(lines) > 0

                all_titles = has_lines and all(safe_get(line, "is_title", False) for line in lines)
                para_text_len = sum(len(line["text"]) for line in lines)

                # total length of the paragraph is less than 200, more than this should not be a title
                para_block["is_block_title"] = 1 if all_titles and para_text_len < 200 else 0

                all_name_or_org_list_to_be_removed = has_lines and all(
                    safe_get(line, "is_author_or_org_list", False) for line in lines
                )
                para_block["is_block_an_author_or_org_list"] = (
                    1 if all_name_or_org_list_to_be_removed and page_id == "page_0" else 0
                )

        pdf_dic["statistics"]["num_titles"] = num_titles

        return pdf_dic

    def __determine_size_based_level(self, title_blocks):
        """
        Assign a title level (1, 2 or 3; 1 is the biggest) to each title block from its font size.

        The mean and standard deviation of all 'block_font_size' values (0 when missing) define a
        band (mean - std, mean + std). The mean of the sizes strictly inside that band is the
        level threshold; if no size falls inside the band, the overall mean is used instead.

        * size >= mean + std                        -> level 1
        * inside the band and size >= threshold     -> level 2
        * everything else                           -> level 3

        The level is written to the 'block_title_level' key of each block.

        Parameters
        ----------
        title_blocks : list
            list of dicts whose "block" entry is the title block

        Returns
        -------
        title_blocks : list
            the same list; the contained blocks are updated in place
        """

        sizes = np.array([safe_get(entry["block"], "block_font_size", 0) for entry in title_blocks])

        # Band of "ordinary" sizes: one standard deviation around the mean.
        overall_mean = np.mean(sizes)
        overall_std = np.std(sizes)
        band_low = overall_mean - overall_std  # type: ignore
        band_high = overall_mean + overall_std  # type: ignore

        # Threshold separating level 2 from level 3, computed without the extreme values.
        inside_band = sizes[(sizes > band_low) & (sizes < band_high)]
        level_threshold = np.mean(inside_band) if inside_band.size > 0 else overall_mean

        for entry in title_blocks:
            block = entry["block"]
            size = safe_get(block, "block_font_size", 0)

            if size >= band_high:
                level = 1
            elif size > band_low and float(size) >= float(level_threshold):
                level = 2
            else:
                level = 3

            block["block_title_level"] = level

        return title_blocks

    def batch_process_blocks_recog_title_level(self, pdf_dic):
        """
        Collect every block flagged 'is_block_title' on the "page_*" entries and assign title levels.

        The levels are determined by `__determine_size_based_level`, which is only called
        when at least one title block was found.

        Parameters
        ----------
        pdf_dic : dict
            result dictionary

        Returns
        -------
        pdf_dic : dict
            the same result dictionary
        """
        found_titles = []

        # Collect all titles
        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            found_titles.extend(
                {"page_id": page_id, "block": candidate}
                for candidate in page.get("para_blocks", [])
                if candidate.get("is_block_title")
            )

        if not found_titles:
            return pdf_dic

        # Determine title level based on font size
        self.__determine_size_based_level(found_titles)

        return pdf_dic