10 months ago · 60054febfe
--- a/magic_pdf/post_proc/llm_aided.py
+++ b/magic_pdf/post_proc/llm_aided.py
@@ -83,24 +83,38 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
 
															             if block["type"] == "title":
														
 
															                 origin_title_list.append(block)
														
 
															                 title_text = merge_para_with_text(block)
														
 
															-                title_dict[f"{i}"] = title_text
														
 
															+                page_line_height_list = []
														
 
															+                for line in block['lines']:
														
 
															+                    bbox = line['bbox']
														
 
															+                    page_line_height_list.append(int(bbox[3] - bbox[1]))
														
 
															+                if len(page_line_height_list) > 0:
														
 
															+                    line_avg_height = sum(page_line_height_list) / len(page_line_height_list)
														
 
															+                else:
														
 
															+                    line_avg_height = int(block['bbox'][3] - block['bbox'][1])
														
 
															+                title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1]
														
 
															                 i += 1
														
 
															     # logger.info(f"Title list: {title_dict}")
														
 
															     title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典，请根据以下指南优化标题的结果，使结果符合正常文档的层次结构：
														
 
															-1. 保留原始内容：
														
 
															+1. 字典中每个value均为一个list，包含以下元素：
														
 
															+    - 标题文本
														
 
															+    - 文本行高是标题所在块的平均行高
														
 
															+    - 标题所在的页码
														
 
															+
														
 
															+2. 保留原始内容：
														
 
															     - 输入的字典中所有元素都是有效的，不能删除字典中的任何元素
														
 
															     - 请务必保证输出的字典中元素的数量和输入的数量一致
														
 
															-2. 保持字典内key-value的对应关系不变
														
 
															+3. 保持字典内key-value的对应关系不变
														
 
															-3. 优化层次结构：
														
 
															+4. 优化层次结构：
														
 
															     - 为每个标题元素添加适当的层次结构
														
 
															-    - 标题层级应具有连续性，不能跳过某一层级
														
 
															+    - 行高较大的标题一般是更高级别的标题
														
 
															+    - 标题从前至后的层级必须是连续的，不能跳过层级
														
 
															     - 标题层级最多为4级，不要添加过多的层级
														
 
															-    - 优化后的标题为一个整数，代表该标题的层级
														
 
															-
														
 
															+    - 优化后的标题只保留代表该标题的层级的整数，不要保留其他信息
														
 
															+    
														
 
															 IMPORTANT: 
														
 
															 请直接返回优化过的由标题层级组成的json，返回的json不需要格式化。
														
@@ -110,24 +124,36 @@ Input title list:
 
															 Corrected title list:
														
 
															 """
														
 
															-    completion = client.chat.completions.create(
														
 
															-        model=title_aided_config["model"],
														
 
															-        messages=[
														
 
															-            {'role': 'user', 'content': title_optimize_prompt}],
														
 
															-        temperature=0.7,
														
 
															-    )
														
 
															-
														
 
															-    json_completion = json.loads(completion.choices[0].message.content)
														
 
															-
														
 
															-    # logger.info(f"Title completion: {json_completion}")
														
 
															+    retry_count = 0
														
 
															+    max_retries = 3
														
 
															+    json_completion = None
														
 
															-    # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
														
 
															-    if len(json_completion) == len(title_dict):
														
 
															+    while retry_count < max_retries:
														
 
															         try:
														
 
															-            for i, origin_title_block in enumerate(origin_title_list):
														
 
															-               origin_title_block["level"] = int(json_completion[str(i)])
														
 
															+            completion = client.chat.completions.create(
														
 
															+                model=title_aided_config["model"],
														
 
															+                messages=[
														
 
															+                    {'role': 'user', 'content': title_optimize_prompt}],
														
 
															+                temperature=0.7,
														
 
															+            )
														
 
															+            json_completion = json.loads(completion.choices[0].message.content)
														
 
															+
														
 
															+            # logger.info(f"Title completion: {json_completion}")
														
 
															+            # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
														
 
															+
														
 
															+            if len(json_completion) == len(title_dict):
														
 
															+                for i, origin_title_block in enumerate(origin_title_list):
														
 
															+                    origin_title_block["level"] = int(json_completion[str(i)])
														
 
															+                break
														
 
															+            else:
														
 
															+                logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
														
 
															+                retry_count += 1
														
 
															         except Exception as e:
														
 
															-            logger.exception(e)
														
 
															-    else:
														
 
															-        logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
														
 
															-
														
 
															+            if e is json.JSONDecodeError:
														
 
															+                logger.warning(f"JSON decode error on attempt {retry_count + 1}: {e}")
														
 
															+            else:
														
 
															+                logger.exception(e)
														
 
															+            retry_count += 1
														
 
															+
														
 
															+    if json_completion is None:
														
 
															+        logger.error("Failed to decode JSON after maximum retries.")