Przeglądaj źródła

refactor(ocr): Increase the dilation factor in OCR to address the issue of word concatenation.

- Remove unused functions such as split_long_words, ocr_mk_mm_markdown_with_para, etc.
- Simplify ocr_mk_markdown_with_para_core_v2 by removing unnecessary language detection and word splitting logic
- Remove wordninja dependency from requirements
- Update ocr_model_init to include additional parameters for OCR model configuration
myhloli 1 rok temu
rodzic
commit
011a1b973b

+ 2 - 194
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,6 +1,5 @@
 import re
 
-import wordninja
 from loguru import logger
 
 from magic_pdf.libs.commons import join_path
@@ -25,37 +24,6 @@ def __is_hyphen_at_line_end(line):
     return bool(re.search(r'[A-Za-z]+-\s*$', line))
 
 
-def split_long_words(text):
-    segments = text.split(' ')
-    for i in range(len(segments)):
-        words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
-        for j in range(len(words)):
-            if len(words[j]) > 10:
-                words[j] = ' '.join(wordninja.split(words[j]))
-        segments[i] = ''.join(words)
-    return ' '.join(segments)
-
-
-def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
-    markdown = []
-    for page_info in pdf_info_list:
-        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(
-            paras_of_layout, 'mm', img_buket_path)
-        markdown.extend(page_markdown)
-    return '\n\n'.join(markdown)
-
-
-def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
-    markdown = []
-    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(
-            paras_of_layout, 'nlp')
-        markdown.extend(page_markdown)
-    return '\n\n'.join(markdown)
-
-
 def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                 img_buket_path):
     markdown_with_para_and_pagination = []
@@ -76,45 +44,6 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
     return markdown_with_para_and_pagination
 
 
-def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
-    page_markdown = []
-    for paras in paras_of_layout:
-        for para in paras:
-            para_text = ''
-            for line in para:
-                for span in line['spans']:
-                    span_type = span.get('type')
-                    content = ''
-                    language = ''
-                    if span_type == ContentType.Text:
-                        content = span['content']
-                        language = detect_lang(content)
-                        if (language == 'en'):  # 只对英文长词进行分词处理,中文分词会丢失文本
-                            content = ocr_escape_special_markdown_char(
-                                split_long_words(content))
-                        else:
-                            content = ocr_escape_special_markdown_char(content)
-                    elif span_type == ContentType.InlineEquation:
-                        content = f"${span['content']}$"
-                    elif span_type == ContentType.InterlineEquation:
-                        content = f"\n$$\n{span['content']}\n$$\n"
-                    elif span_type in [ContentType.Image, ContentType.Table]:
-                        if mode == 'mm':
-                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
-                        elif mode == 'nlp':
-                            pass
-                    if content != '':
-                        if language == 'en':  # 英文语境下 content间需要空格分隔
-                            para_text += content + ' '
-                        else:  # 中文语境下,content间不需要空格分隔
-                            para_text += content
-            if para_text.strip() == '':
-                continue
-            else:
-                page_markdown.append(para_text.strip() + '  ')
-    return page_markdown
-
-
 def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                       mode,
                                       img_buket_path='',
@@ -207,21 +136,11 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
         if line_text != '':
             line_lang = detect_lang(line_text)
         for span in line['spans']:
+
             span_type = span['type']
             content = ''
             if span_type == ContentType.Text:
-                content = span['content']
-                # language = detect_lang(content)
-                language = detect_language(content)
-                # 判断是否小语种
-                if lang is not None and lang != 'en':
-                    content = ocr_escape_special_markdown_char(content)
-                else:  # 非小语种逻辑
-                    if language == 'en' and parse_type == 'ocr':  # 只对英文长词进行分词处理,中文分词会丢失文本
-                        content = ocr_escape_special_markdown_char(
-                            split_long_words(content))
-                    else:
-                        content = ocr_escape_special_markdown_char(content)
+                content = ocr_escape_special_markdown_char(span['content'])
             elif span_type == ContentType.InlineEquation:
                 content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
@@ -242,41 +161,6 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
     return para_text
 
 
-def para_to_standard_format(para, img_buket_path):
-    para_content = {}
-    if len(para) == 1:
-        para_content = line_to_standard_format(para[0], img_buket_path)
-    elif len(para) > 1:
-        para_text = ''
-        inline_equation_num = 0
-        for line in para:
-            for span in line['spans']:
-                language = ''
-                span_type = span.get('type')
-                content = ''
-                if span_type == ContentType.Text:
-                    content = span['content']
-                    language = detect_lang(content)
-                    if language == 'en':  # 只对英文长词进行分词处理,中文分词会丢失文本
-                        content = ocr_escape_special_markdown_char(
-                            split_long_words(content))
-                    else:
-                        content = ocr_escape_special_markdown_char(content)
-                elif span_type == ContentType.InlineEquation:
-                    content = f"${span['content']}$"
-                    inline_equation_num += 1
-                if language == 'en':  # 英文语境下 content间需要空格分隔
-                    para_text += content + ' '
-                else:  # 中文语境下,content间不需要空格分隔
-                    para_text += content
-        para_content = {
-            'type': 'text',
-            'text': para_text,
-            'inline_equation_num': inline_equation_num,
-        }
-    return para_content
-
-
 def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
     para_type = para_block['type']
     para_content = {}
@@ -330,82 +214,6 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
     return para_content
 
 
-def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
-    content_list = []
-    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get('para_blocks')
-        if not paras_of_layout:
-            continue
-        for para_block in paras_of_layout:
-            para_content = para_to_standard_format_v2(para_block,
-                                                      img_buket_path)
-            content_list.append(para_content)
-    return content_list
-
-
-def line_to_standard_format(line, img_buket_path):
-    line_text = ''
-    inline_equation_num = 0
-    for span in line['spans']:
-        if not span.get('content'):
-            if not span.get('image_path'):
-                continue
-            else:
-                if span['type'] == ContentType.Image:
-                    content = {
-                        'type': 'image',
-                        'img_path': join_path(img_buket_path,
-                                              span['image_path']),
-                    }
-                    return content
-                elif span['type'] == ContentType.Table:
-                    content = {
-                        'type': 'table',
-                        'img_path': join_path(img_buket_path,
-                                              span['image_path']),
-                    }
-                    return content
-        else:
-            if span['type'] == ContentType.InterlineEquation:
-                interline_equation = span['content']
-                content = {
-                    'type': 'equation',
-                    'latex': f'$$\n{interline_equation}\n$$'
-                }
-                return content
-            elif span['type'] == ContentType.InlineEquation:
-                inline_equation = span['content']
-                line_text += f'${inline_equation}$'
-                inline_equation_num += 1
-            elif span['type'] == ContentType.Text:
-                text_content = ocr_escape_special_markdown_char(
-                    span['content'])  # 转义特殊符号
-                line_text += text_content
-    content = {
-        'type': 'text',
-        'text': line_text,
-        'inline_equation_num': inline_equation_num,
-    }
-    return content
-
-
-def ocr_mk_mm_standard_format(pdf_info_dict: list):
-    """content_list type         string
-    image/text/table/equation(行间的单独拿出来,行内的和text合并) latex        string
-    latex文本字段。 text         string      纯文本格式的文本数据。 md           string
-    markdown格式的文本数据。 img_path     string      s3://full/path/to/img.jpg."""
-    content_list = []
-    for page_info in pdf_info_dict:
-        blocks = page_info.get('preproc_blocks')
-        if not blocks:
-            continue
-        for block in blocks:
-            for line in block['lines']:
-                content = line_to_standard_format(line)
-                content_list.append(content)
-    return content_list
-
-
 def union_make(pdf_info_dict: list,
                make_mode: str,
                drop_mode: str,

+ 3 - 3
magic_pdf/model/pdf_extract_kit.py

@@ -77,11 +77,11 @@ def layout_model_init(weight, config_file, device):
     return model
 
 
-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4):
     if lang is not None:
-        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
     else:
-        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
     return model
 
 

+ 0 - 1
requirements-docker.txt

@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9
 loguru>=0.6.0
 numpy>=1.21.6,<2.0.0
 fast-langdetect==0.2.0
-wordninja>=2.0.0
 scikit-learn>=1.0.2
 pdfminer.six==20231228
 unimernet==0.2.1

+ 0 - 1
requirements.txt

@@ -8,7 +8,6 @@ pdfminer.six==20231228
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 scikit-learn>=1.0.2
-wordninja>=2.0.0
 torch>=2.2.2,<=2.3.1
 transformers
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.