|
@@ -83,24 +83,38 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
|
|
|
if block["type"] == "title":
|
|
if block["type"] == "title":
|
|
|
origin_title_list.append(block)
|
|
origin_title_list.append(block)
|
|
|
title_text = merge_para_with_text(block)
|
|
title_text = merge_para_with_text(block)
|
|
|
- title_dict[f"{i}"] = title_text
|
|
|
|
|
|
|
+ page_line_height_list = []
|
|
|
|
|
+ for line in block['lines']:
|
|
|
|
|
+ bbox = line['bbox']
|
|
|
|
|
+ page_line_height_list.append(int(bbox[3] - bbox[1]))
|
|
|
|
|
+ if len(page_line_height_list) > 0:
|
|
|
|
|
+ line_avg_height = sum(page_line_height_list) / len(page_line_height_list)
|
|
|
|
|
+ else:
|
|
|
|
|
+ line_avg_height = int(block['bbox'][3] - block['bbox'][1])
|
|
|
|
|
+ title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1]
|
|
|
i += 1
|
|
i += 1
|
|
|
# logger.info(f"Title list: {title_dict}")
|
|
# logger.info(f"Title list: {title_dict}")
|
|
|
|
|
|
|
|
title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
|
|
title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
|
|
|
|
|
|
|
|
-1. 保留原始内容:
|
|
|
|
|
|
|
+1. 字典中每个value均为一个list,包含以下元素:
|
|
|
|
|
+ - 标题文本
|
|
|
|
|
+ - 文本行高是标题所在块的平均行高
|
|
|
|
|
+ - 标题所在的页码
|
|
|
|
|
+
|
|
|
|
|
+2. 保留原始内容:
|
|
|
- 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
|
|
- 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
|
|
|
- 请务必保证输出的字典中元素的数量和输入的数量一致
|
|
- 请务必保证输出的字典中元素的数量和输入的数量一致
|
|
|
|
|
|
|
|
-2. 保持字典内key-value的对应关系不变
|
|
|
|
|
|
|
+3. 保持字典内key-value的对应关系不变
|
|
|
|
|
|
|
|
-3. 优化层次结构:
|
|
|
|
|
|
|
+4. 优化层次结构:
|
|
|
- 为每个标题元素添加适当的层次结构
|
|
- 为每个标题元素添加适当的层次结构
|
|
|
- - 标题层级应具有连续性,不能跳过某一层级
|
|
|
|
|
|
|
+ - 行高较大的标题一般是更高级别的标题
|
|
|
|
|
+ - 标题从前至后的层级必须是连续的,不能跳过层级
|
|
|
- 标题层级最多为4级,不要添加过多的层级
|
|
- 标题层级最多为4级,不要添加过多的层级
|
|
|
- - 优化后的标题为一个整数,代表该标题的层级
|
|
|
|
|
-
|
|
|
|
|
|
|
+ - 优化后的标题只保留代表该标题的层级的整数,不要保留其他信息
|
|
|
|
|
+
|
|
|
IMPORTANT:
|
|
IMPORTANT:
|
|
|
请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
|
|
请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
|
|
|
|
|
|
|
@@ -110,24 +124,36 @@ Input title list:
|
|
|
Corrected title list:
|
|
Corrected title list:
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
- completion = client.chat.completions.create(
|
|
|
|
|
- model=title_aided_config["model"],
|
|
|
|
|
- messages=[
|
|
|
|
|
- {'role': 'user', 'content': title_optimize_prompt}],
|
|
|
|
|
- temperature=0.7,
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- json_completion = json.loads(completion.choices[0].message.content)
|
|
|
|
|
-
|
|
|
|
|
- # logger.info(f"Title completion: {json_completion}")
|
|
|
|
|
|
|
+ retry_count = 0
|
|
|
|
|
+ max_retries = 3
|
|
|
|
|
+ json_completion = None
|
|
|
|
|
|
|
|
- # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
|
|
|
|
|
- if len(json_completion) == len(title_dict):
|
|
|
|
|
|
|
+ while retry_count < max_retries:
|
|
|
try:
|
|
try:
|
|
|
- for i, origin_title_block in enumerate(origin_title_list):
|
|
|
|
|
- origin_title_block["level"] = int(json_completion[str(i)])
|
|
|
|
|
|
|
+ completion = client.chat.completions.create(
|
|
|
|
|
+ model=title_aided_config["model"],
|
|
|
|
|
+ messages=[
|
|
|
|
|
+ {'role': 'user', 'content': title_optimize_prompt}],
|
|
|
|
|
+ temperature=0.7,
|
|
|
|
|
+ )
|
|
|
|
|
+ json_completion = json.loads(completion.choices[0].message.content)
|
|
|
|
|
+
|
|
|
|
|
+ # logger.info(f"Title completion: {json_completion}")
|
|
|
|
|
+ # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
|
|
|
|
|
+
|
|
|
|
|
+ if len(json_completion) == len(title_dict):
|
|
|
|
|
+ for i, origin_title_block in enumerate(origin_title_list):
|
|
|
|
|
+ origin_title_block["level"] = int(json_completion[str(i)])
|
|
|
|
|
+ break
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
|
|
|
|
|
+ retry_count += 1
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- logger.exception(e)
|
|
|
|
|
- else:
|
|
|
|
|
- logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
|
|
|
|
|
-
|
|
|
|
|
|
|
+ if isinstance(e, json.JSONDecodeError):
|
|
|
|
|
+ logger.warning(f"JSON decode error on attempt {retry_count + 1}: {e}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.exception(e)
|
|
|
|
|
+ retry_count += 1
|
|
|
|
|
+
|
|
|
|
|
+ if json_completion is None:
|
|
|
|
|
+ logger.error("Failed to decode JSON after maximum retries.")
|