Эх сурвалжийг харах

增加生成多模态markdown逻辑

赵小蒙 1 жил өмнө
parent
commit
ec1a6ef716

+ 28 - 0
magic_pdf/dict2md/ocr_mkcontent.py

@@ -21,3 +21,31 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                 # 在行末添加两个空格以强制换行
                 markdown.append(line_text.strip() + '  ')
     return '\n'.join(markdown)
+
+def mk_mm_markdown(pdf_info_dict: dict):
+    """Render the parsed PDF info as multimodal markdown (text, equations, images).
+
+    Args:
+        pdf_info_dict: Mapping whose values are per-page dicts. Pages without
+            a non-empty "preproc_blocks" entry are skipped. Each block
+            provides 'lines', each line provides 'spans'.
+
+    Returns:
+        One markdown string with one output line per input line, joined by
+        newlines. Every output line ends with two spaces (markdown hard break).
+
+    Span handling:
+        * Span with non-empty 'content': every '$' is escaped as '\\$'; the
+          text is then wrapped in '$...$' for type 'inline_equation' or in a
+          '$$' fenced block for type 'displayed_equation'. Any other (or
+          missing) type is emitted as plain text.
+        * Span without content but with 'image_path': emitted as a markdown
+          image link '![](<image_path>)'.
+        * Span with neither: skipped.
+    """
+    markdown = []
+
+    for page_info in pdf_info_dict.values():
+        blocks = page_info.get("preproc_blocks")
+        if not blocks:
+            continue
+        for block in blocks:
+            for line in block['lines']:
+                parts = []
+                for span in line['spans']:
+                    text = span.get('content')
+                    if text:
+                        # Escape '$' so literal dollars are not read as math
+                        # delimiters ('\\$' is a backslash followed by '$').
+                        content = text.replace('$', '\\$')
+                        span_type = span.get('type')
+                        if span_type == 'inline_equation':
+                            content = f"${content}$"
+                        elif span_type == 'displayed_equation':
+                            content = f"$$\n{content}\n$$"
+                    elif span.get('image_path'):
+                        content = f"![]({span['image_path']})"
+                    else:
+                        # Nothing renderable in this span.
+                        continue
+                    parts.append(content)
+                # Two trailing spaces force a line break in markdown.
+                markdown.append(' '.join(parts).strip() + '  ')
+    return '\n'.join(markdown)

+ 2 - 2
magic_pdf/pre_proc/ocr_cut_image.py

@@ -12,8 +12,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
     for span in spans:
         span_type = span['type']
         if span_type == 'image':
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('image'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
         elif span_type == 'table':
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('table'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
 
     return spans