| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- import re
- from os import path
- from collections import Counter
- from loguru import logger
- # from langdetect import detect
- import spacy
- import en_core_web_sm
- import zh_core_web_sm
- from magic_pdf.libs.language import detect_lang
- class NLPModels:
- """
- How to upload local models to s3:
- - config aws cli:
- doc\SETUP-CLI.md
- doc\setup_cli.sh
- app\config\__init__.py
- - $ cd {local_dir_storing_models}
- - $ ls models
- en_core_web_sm-3.7.1/
- zh_core_web_sm-3.7.0/
- - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
- - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
- PRE en_core_web_sm-3.7.1/
- PRE zh_core_web_sm-3.7.0/
- """
- def __init__(self):
- # if OS is windows, set "TMP_DIR" to "D:/tmp"
- home_dir = path.expanduser("~")
- self.default_local_path = path.join(home_dir, ".nlp_models")
- self.default_shared_path = "/share/pdf_processor/nlp_models"
- self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
- self.default_s3_path = "s3://llm-infra/models"
- self.nlp_models = self.nlp_models = {
- "en_core_web_sm": {
- "type": "spacy",
- "version": "3.7.1",
- },
- "en_core_web_md": {
- "type": "spacy",
- "version": "3.7.1",
- },
- "en_core_web_lg": {
- "type": "spacy",
- "version": "3.7.1",
- },
- "zh_core_web_sm": {
- "type": "spacy",
- "version": "3.7.0",
- },
- "zh_core_web_md": {
- "type": "spacy",
- "version": "3.7.0",
- },
- "zh_core_web_lg": {
- "type": "spacy",
- "version": "3.7.0",
- },
- }
- self.en_core_web_sm_model = en_core_web_sm.load()
- self.zh_core_web_sm_model = zh_core_web_sm.load()
- def load_model(self, model_name, model_type, model_version):
- if (
- model_name in self.nlp_models
- and self.nlp_models[model_name]["type"] == model_type
- and self.nlp_models[model_name]["version"] == model_version
- ):
- return spacy.load(model_name) if spacy.util.is_package(model_name) else None
- else:
- logger.error(f"Unsupported model name or version: {model_name} {model_version}")
- return None
- def detect_language(self, text, use_langdetect=False):
- if len(text) == 0:
- return None
- if use_langdetect:
- # print("use_langdetect")
- # print(detect_lang(text))
- # return detect_lang(text)
- if detect_lang(text) == "zh":
- return "zh"
- else:
- return "en"
- if not use_langdetect:
- en_count = len(re.findall(r"[a-zA-Z]", text))
- cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
- if en_count > cn_count:
- return "en"
- if cn_count > en_count:
- return "zh"
- def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
- """
- Detect entity categories using NLP models and return the most frequent entity types.
- Parameters
- ----------
- text : str
- Text to be processed.
- Returns
- -------
- str
- The most frequent entity type.
- """
- lang = self.detect_language(text, use_langdetect=True)
- if lang == "en":
- nlp_model = self.en_core_web_sm_model
- elif lang == "zh":
- nlp_model = self.zh_core_web_sm_model
- else:
- # logger.error(f"Unsupported language: {lang}")
- return {}
- # Splitting text into smaller parts
- text_parts = re.split(r"[,;,;、\s & |]+", text)
- text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] # Remove non-words
- text_combined = " ".join(text_parts)
- try:
- doc = nlp_model(text_combined)
- entity_counts = Counter([ent.label_ for ent in doc.ents])
- word_counts_in_entities = Counter()
- for ent in doc.ents:
- word_counts_in_entities[ent.label_] += len(ent.text.split())
- total_words_in_entities = sum(word_counts_in_entities.values())
- total_words = len([token for token in doc if not token.is_punct])
- if total_words_in_entities == 0 or total_words == 0:
- return None
- entity_percentage = total_words_in_entities / total_words
- if entity_percentage < 0.5:
- return None
- most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
- entity_percentage = word_count / total_words_in_entities
- if entity_percentage >= threshold:
- return most_common_entity
- else:
- return None
- except Exception as e:
- logger.error(f"Error in entity detection: {e}")
- return None
- def __main__():
- nlpModel = NLPModels()
- test_strings = [
- "张三",
- "张三, 李四,王五; 赵六",
- "John Doe",
- "Jane Smith",
- "Lee, John",
- "John Doe, Jane Smith; Alice Johnson,Bob Lee",
- "孙七, Michael Jordan;赵八",
- "David Smith Michael O'Connor; Kevin ßáçøñ",
- "李雷·韩梅梅, 张三·李四",
- "Charles Robert Darwin, Isaac Newton",
- "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
- "John Doe, Jane Smith; Alice Johnson",
- "张三, 李四,王五; 赵六",
- "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
- "Rachel Mills & William Barry & Susanne B. Haga",
- "Claire Chabut* and Jean-François Bussières",
- "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
- "Changchun",
- "china",
- "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
- "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
- "Synergistic Effect of Supported Nickel Catalyst with",
- "Intumescent Flame-Retardants on Flame Retardancy",
- "and Thermal Stability of Polypropylene",
- ]
- for test in test_strings:
- print()
- print(f"Original String: {test}")
- result = nlpModel.detect_entity_catgr_using_nlp(test)
- print(f"Detected entities: {result}")
- if __name__ == "__main__":
- __main__()
|