| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438 |
- import math
- from loguru import logger
- from magic_pdf.config.ocr_content_type import ContentType
- from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
- find_top_nearest_text_bbox)
- from magic_pdf.libs.commons import join_path
# Short aliases for the two equation members of ContentType.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
# Node types of the unified content list that carry a 'text' field:
# plain text plus the heading levels h1-h6.
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
def mk_nlp_markdown_1(para_dict: dict):
    """Join the paragraph texts of every page into one markdown string.

    Deprecated: calling this function emits a ``DeprecationWarning``.
    (Decorating the function with ``@DeprecationWarning`` would rebind the
    name to a warning instance and make it impossible to call, so the
    warning is raised explicitly instead.)

    Args:
        para_dict: Mapping of page key -> page info. A page may hold a
            ``'para_blocks'`` list; each block exposes a ``'paras'`` mapping
            whose values are dicts with ``'para_text'``, ``'is_para_title'``
            and ``'para_title_level'``.

    Returns:
        All paragraph texts separated by blank lines. Title paragraphs are
        prefixed with ``'#' * para_title_level`` and a space.
    """
    import warnings

    warnings.warn(
        'mk_nlp_markdown_1 is deprecated',
        DeprecationWarning,
        stacklevel=2,
    )

    content_lst = []
    for page_info in para_dict.values():
        # Pages without paragraph blocks contribute nothing.
        for block in page_info.get('para_blocks') or []:
            for p in block['paras'].values():
                para_text = p['para_text']
                if p['is_para_title']:
                    # The heading prefix is only needed for titles.
                    md_title_prefix = '#' * p['para_title_level']
                    content_lst.append(f'{md_title_prefix} {para_text}')
                else:
                    content_lst.append(para_text)
    return '\n\n'.join(content_lst)
def __find_index(paragraph, target):
    """Return the index of the first occurrence of target in paragraph.

    Returns None when target does not occur in paragraph.
    """
    position = paragraph.find(target)
    return None if position == -1 else position
def __insert_string(paragraph, target, position):
    """Return a copy of paragraph with target spliced in at index position."""
    head = paragraph[:position]
    tail = paragraph[position:]
    return head + target + tail
def __insert_after(content, image_content, target):
    """Insert image_content right after the first occurrence of target.

    The inserted text is surrounded by blank lines. When target is not found
    an error is logged and content is returned unchanged.
    """
    position = content.find(target)
    if position == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content

    # Split just behind the matched target and put the image in between.
    cut = position + len(target)
    return content[:cut] + '\n\n' + image_content + '\n\n' + content[cut:]
def __insert_before(content, image_content, target):
    """Insert image_content right before the first occurrence of target.

    The inserted text is surrounded by blank lines. When target is not found
    an error is logged and content is returned unchanged.
    """
    position = content.find(target)
    if position == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content

    head, tail = content[:position], content[position:]
    return head + '\n\n' + image_content + '\n\n' + tail
def mk_mm_markdown_1(para_dict: dict):
    """Assemble multimodal markdown (text plus image links) page by page.

    Deprecated: calling this function emits a ``DeprecationWarning``.
    (Decorating the function with ``@DeprecationWarning`` would rebind the
    name to a warning instance and make it impossible to call.)

    Every entry of a page's ``'images'``, ``'image_backup'``, ``'tables'``
    and ``'table_backup'`` lists is rendered as ``![](<image_path>)``.

    * A page without ``'para_blocks'`` or ``'preproc_blocks'`` consists of
      its image links only, in list order.
    * Otherwise the paragraphs are joined first and every image link is then
      inserted exactly once, next to the text line that is closest to the
      top-left corner of the image ``'bbox'``.

    Args:
        para_dict: Mapping of page key -> page info dict.

    Returns:
        The markdown of all pages, separated by blank lines.
    """
    import warnings

    warnings.warn(
        'mk_mm_markdown_1 is deprecated',
        DeprecationWarning,
        stacklevel=2,
    )

    def _line_text(line):
        # A line's searchable text is the concatenation of its span texts.
        return ''.join(s['text'] for s in line['spans'])

    def _place_image(page_md, img, raw_blocks):
        """Return page_md with the link of img inserted near its text."""
        imgbox = img['bbox']
        img_content = f"![]({img['image_path']})"
        corner_x, corner_y = imgbox[0], imgbox[1]

        def _covers(box):
            # Top-left corner of the image inside box, 1 unit of tolerance.
            return (
                box[0] - 1 <= corner_x < box[2] + 1
                and box[1] - 1 <= corner_y < box[3] + 1
            )

        def _corner_distance(line):
            line_box = line['bbox']
            return math.hypot(line_box[0] - corner_x, line_box[1] - corner_y)

        # First look for a block that contains the image corner.
        for block in raw_blocks:
            if not _covers(block['bbox']):
                continue
            lines = block['lines']
            containing = next((ln for ln in lines if _covers(ln['bbox'])), None)
            if containing is not None:
                # Corner inside a line: the image goes before that line.
                return __insert_before(page_md, img_content, _line_text(containing))
            if lines:
                # Corner between lines: anchor on the closest line.
                nearest = min(lines, key=_corner_distance)
                line_txt = _line_text(nearest)
                img_h = imgbox[3] - imgbox[1]
                if _corner_distance(nearest) < img_h:  # text precedes the image
                    return __insert_after(page_md, img_content, line_txt)
                return __insert_before(page_md, img_content, line_txt)
            logger.error(
                f"Can't find the location of image {img['image_path']} in the markdown file #1"
            )

        # No block placed the image: it should sit between two blocks. Prefer
        # the nearest text block above, then the nearest one below.
        top_txt_block = find_top_nearest_text_bbox(raw_blocks, imgbox)
        if top_txt_block:
            return __insert_after(
                page_md, img_content, _line_text(top_txt_block['lines'][-1])
            )
        bottom_txt_block = find_bottom_nearest_text_bbox(raw_blocks, imgbox)
        if bottom_txt_block:
            return __insert_before(
                page_md, img_content, _line_text(bottom_txt_block['lines'][0])
            )
        logger.error(
            f"Can't find the location of image {img['image_path']} in the markdown file #2"
        )
        return page_md

    content_lst = []
    for page_info in para_dict.values():
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')
        all_page_images = []
        for key in ('images', 'image_backup', 'tables', 'table_backup'):
            all_page_images.extend(page_info.get(key, []))

        if not para_blocks or not pymu_raw_blocks:
            # Image-only page. TODO: image order
            page_md = '\n\n'.join(
                f"![]({img['image_path']})" for img in all_page_images
            )
        else:
            page_lst = []  # paragraphs of this page
            for block in para_blocks:
                for p in block['paras'].values():
                    para_text = p['para_text']
                    if p['is_para_title']:
                        md_title_prefix = '#' * p['para_title_level']
                        page_lst.append(f'{md_title_prefix} {para_text}')
                    else:
                        page_lst.append(para_text)
            # Join the page text, then insert the images into it.
            page_md = '\n\n'.join(page_lst)
            for img in all_page_images:
                page_md = _place_image(page_md, img, pymu_raw_blocks)
        content_lst.append(page_md)

    # Join the text of all pages.
    return '\n\n'.join(content_lst)
def __insert_after_para(text, type, element, content_list):
    """Insert a new image/table node right after the node containing text.

    The anchor is the first node of content_list whose type is listed in
    UNI_FORMAT_TEXT_TYPE and whose 'text' contains text. content_list is
    modified in place; when no anchor exists an error is logged instead.

    Args:
        text: Text snippet used to locate the anchor node.
        type: Kind of node to build, 'image' or 'table'.
        element: Source dict; 'image_path' (and for tables 'text' and
            'quality') are copied into the new node.
        content_list: Unified content list to insert into.
    """
    anchor = next(
        (
            idx
            for idx, node in enumerate(content_list)
            if node.get('type') in UNI_FORMAT_TEXT_TYPE
            and text in node.get('text', '')
        ),
        None,
    )
    if anchor is None:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )
        return

    if type == 'image':
        new_node = {
            'type': 'image',
            'img_path': element.get('image_path'),
            'img_alt': '',
            'img_title': '',
            'img_caption': '',
        }
    elif type == 'table':
        new_node = {
            'type': 'table',
            'img_path': element.get('image_path'),
            'table_latex': element.get('text'),
            'table_title': '',
            'table_caption': '',
            'table_quality': element.get('quality'),
        }
    content_list.insert(anchor + 1, new_node)
def __insert_before_para(text, type, element, content_list):
    """Insert a new image/table node right before the node containing text.

    The anchor is the first node of content_list whose type is listed in
    UNI_FORMAT_TEXT_TYPE and whose 'text' contains text. content_list is
    modified in place; when no anchor exists an error is logged instead.

    Args:
        text: Text snippet used to locate the anchor node.
        type: Kind of node to build, 'image' or 'table'.
        element: Source dict; 'image_path' (and for tables 'text' and
            'quality') are copied into the new node.
        content_list: Unified content list to insert into.
    """
    anchor = None
    for idx, node in enumerate(content_list):
        if node.get('type') not in UNI_FORMAT_TEXT_TYPE:
            continue
        if text in node.get('text', ''):
            anchor = idx
            break

    if anchor is None:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )
        return

    if type == 'image':
        new_node = {
            'type': 'image',
            'img_path': element.get('image_path'),
            'img_alt': '',
            'img_title': '',
            'img_caption': '',
        }
    elif type == 'table':
        new_node = {
            'type': 'table',
            'img_path': element.get('image_path'),
            'table_latex': element.get('text'),
            'table_title': '',
            'table_caption': '',
            'table_quality': element.get('quality'),
        }
    content_list.insert(anchor, new_node)
def mk_universal_format(pdf_info_list: list, img_buket_path):
    """Build the unified content list for a whole document.

    Format description:
    https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY

    Each page contributes nodes in this way:

    * A page without ``'para_blocks'`` or ``'preproc_blocks'`` is treated as
      image-only: its images and tables are appended in list order and no
      text-relative placement is attempted (there is no text on the page to
      anchor them to).
    * Otherwise every paragraph becomes an ``'equation'``, ``'h<level>'`` or
      ``'text'`` node, and the page's images and tables are then inserted
      next to their nearest text via ``insert_img_or_table``.

    Args:
        pdf_info_list: List of page info dicts.
        img_buket_path: Prefix joined onto ``image_path`` for the nodes of
            image-only pages.

    Returns:
        The list of content nodes of all pages.
    """
    # NOTE(review): nodes created through insert_img_or_table carry the raw
    # 'image_path' without img_buket_path, unlike image-only pages — confirm
    # whether that difference is intended.
    content_lst = []
    for page_info in pdf_info_list:
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')
        all_page_images = []
        all_page_images.extend(page_info.get('images') or [])
        all_page_images.extend(page_info.get('image_backup') or [])
        # Tables are kept apart from images so they get their own node type.
        all_page_tables = list(page_info.get('tables') or [])

        if not para_blocks or not pymu_raw_blocks:
            # Image-only page. TODO: image order
            for img in all_page_images:
                content_lst.append(
                    {
                        'type': 'image',
                        'img_path': join_path(img_buket_path, img['image_path']),
                        'img_alt': '',
                        'img_title': '',
                        'img_caption': '',
                    }
                )
            for table in all_page_tables:
                content_lst.append(
                    {
                        'type': 'table',
                        'img_path': join_path(img_buket_path, table['image_path']),
                        'table_latex': table.get('text'),
                        'table_title': '',
                        'table_caption': '',
                        'table_quality': table.get('quality'),
                    }
                )
            # The nodes are already placed; positional insertion would only
            # log errors or anchor a second copy to another page's text.
            continue

        for block in para_blocks:
            for p in block['paras'].values():
                # A paragraph is either ordinary text or an interline equation.
                if p['para_font_type'] == TYPE_INTERLINE_EQUATION:
                    content_lst.append({'type': 'equation', 'latex': p['para_text']})
                elif p['is_para_title']:
                    content_lst.append(
                        {
                            'type': f"h{p['para_title_level']}",
                            'text': p['para_text'],
                        }
                    )
                else:
                    content_lst.append({'type': 'text', 'text': p['para_text']})

        # Insert the images, then the tables, next to their nearest text.
        for img in all_page_images:
            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
        for table in all_page_tables:
            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)

    return content_lst
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    """Insert one image or table node into content_lst next to its text.

    The element is located by the top-left corner of ``element['bbox']`` and
    is inserted exactly once:

    1. If the corner lies inside a raw block (1 unit of tolerance), the node
       goes before the line containing the corner. When no line contains it,
       the line whose top-left corner is closest is used as anchor: the node
       goes after it when that distance is smaller than the element height,
       before it otherwise.
    2. If no block places the element, it is assumed to sit between two
       blocks: it goes after the last line of the nearest text block above,
       or else before the first line of the nearest text block below.

    Args:
        type: 'image' or 'table'; forwarded to the node-building helpers.
        element: Dict with a 'bbox' and the fields copied into the node.
        pymu_raw_blocks: Raw page blocks, each with 'bbox' and 'lines'
            (lines carry 'bbox' and 'spans'). ``None`` is treated as empty.
        content_lst: Unified content list, modified in place.
    """
    element_bbox = element['bbox']
    corner_x, corner_y = element_bbox[0], element_bbox[1]
    blocks = pymu_raw_blocks or []

    def _covers(box):
        # Top-left corner of the element inside box, 1 unit of tolerance.
        return (
            box[0] - 1 <= corner_x < box[2] + 1
            and box[1] - 1 <= corner_y < box[3] + 1
        )

    def _line_text(line):
        # A line's searchable text is the concatenation of its span texts.
        return ''.join(s['text'] for s in line['spans'])

    def _corner_distance(line):
        line_box = line['bbox']
        return math.hypot(line_box[0] - corner_x, line_box[1] - corner_y)

    # First look for a block that contains the element corner.
    for block in blocks:
        if not _covers(block['bbox']):
            continue
        lines = block['lines']
        containing = next((ln for ln in lines if _covers(ln['bbox'])), None)
        if containing is not None:
            # Corner inside a line: insert before that line's paragraph.
            __insert_before_para(_line_text(containing), type, element, content_lst)
            return
        if lines:
            # Corner between lines: anchor on the closest line.
            nearest = min(lines, key=_corner_distance)
            line_txt = _line_text(nearest)
            img_h = element_bbox[3] - element_bbox[1]
            if _corner_distance(nearest) < img_h:  # text precedes the element
                __insert_after_para(line_txt, type, element, content_lst)
            else:
                __insert_before_para(line_txt, type, element, content_lst)
            return
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
        )

    # No block placed the element: it should sit between two blocks. Prefer
    # the nearest text block above, then the nearest one below.
    top_txt_block = find_top_nearest_text_bbox(blocks, element_bbox)
    if top_txt_block:
        line_txt = _line_text(top_txt_block['lines'][-1])
        __insert_after_para(line_txt, type, element, content_lst)
        return
    bottom_txt_block = find_bottom_nearest_text_bbox(blocks, element_bbox)
    if bottom_txt_block:
        line_txt = _line_text(bottom_txt_block['lines'][0])
        __insert_before_para(line_txt, type, element, content_lst)
        return
    # TODO: an image may occupy a whole column with no text above or below.
    logger.error(
        f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
    )
def mk_mm_markdown(content_list):
    """Render the unified content list as markdown, including images.

    Node handling by ``'type'``:

    * ``'text'``: the ``'text'`` value as is.
    * ``'equation'``: the ``'latex'`` value; it is wrapped in a ``$$`` block
      unless it already starts and ends with ``$$``.
    * ``'image'``: a markdown image link ``![](<img_path>)``.
    * ``'h1'`` .. ``'h6'``: the ``'text'`` value prefixed with that many ``#``.

    Nodes of any other type are skipped.

    Args:
        content_list: List of content node dicts.

    Returns:
        The rendered pieces separated by blank lines.
    """
    content_md = []
    for c in content_list:
        content_type = c.get('type')
        if content_type == 'text':
            content_md.append(c.get('text'))
        elif content_type == 'equation':
            latex = c.get('latex')
            if latex.startswith('$$') and latex.endswith('$$'):
                # Already delimited; wrapping again would break the math.
                content_md.append(latex)
            else:
                content_md.append(f'\n$$\n{latex}\n$$\n')
        elif content_type == 'image':
            content_md.append(f"![]({c.get('img_path')})")
        elif content_type in UNI_FORMAT_TEXT_TYPE:
            # 'text' was handled above, so only 'h1'..'h6' arrive here; the
            # digit after 'h' is the heading level.
            content_md.append(f"{'#' * int(content_type[1])} {c.get('text')}")
    return '\n\n'.join(content_md)
def mk_nlp_markdown(content_list):
    """Render the unified content list as markdown, without images.

    Node handling by ``'type'``:

    * ``'text'``: the ``'text'`` value as is.
    * ``'equation'``: the ``'latex'`` value wrapped in a ``$$`` block; a value
      that already starts and ends with ``$$`` is emitted unchanged, matching
      ``mk_mm_markdown``.
    * ``'table'``: the ``'table_latex'`` value wrapped in a ``$$$`` block.
    * ``'h1'`` .. ``'h6'``: the ``'text'`` value prefixed with that many ``#``.

    Nodes of any other type (for example images) are skipped.

    Args:
        content_list: List of content node dicts.

    Returns:
        The rendered pieces separated by blank lines.
    """
    content_md = []
    for c in content_list:
        content_type = c.get('type')
        if content_type == 'text':
            content_md.append(c.get('text'))
        elif content_type == 'equation':
            latex = c.get('latex')
            already_wrapped = (
                isinstance(latex, str)
                and latex.startswith('$$')
                and latex.endswith('$$')
            )
            if already_wrapped:
                # Wrapping a delimited formula again would break the math.
                content_md.append(latex)
            else:
                content_md.append(f'$$\n{latex}\n$$')
        elif content_type == 'table':
            content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
        elif content_type in UNI_FORMAT_TEXT_TYPE:
            # 'text' was handled above, so only 'h1'..'h6' arrive here; the
            # digit after 'h' is the heading level.
            content_md.append(f"{'#' * int(content_type[1])} {c.get('text')}")
    return '\n\n'.join(content_md)
|