# nlp_utils.py
  1. import re
  2. from os import path
  3. from collections import Counter
  4. from loguru import logger
  5. # from langdetect import detect
  6. import spacy
  7. import en_core_web_sm
  8. import zh_core_web_sm
  9. from magic_pdf.libs.language import detect_lang
  10. class NLPModels:
  11. """
  12. How to upload local models to s3:
  13. - config aws cli:
  14. doc\SETUP-CLI.md
  15. doc\setup_cli.sh
  16. app\config\__init__.py
  17. - $ cd {local_dir_storing_models}
  18. - $ ls models
  19. en_core_web_sm-3.7.1/
  20. zh_core_web_sm-3.7.0/
  21. - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
  22. - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
  23. PRE en_core_web_sm-3.7.1/
  24. PRE zh_core_web_sm-3.7.0/
  25. """
  26. def __init__(self):
  27. # if OS is windows, set "TMP_DIR" to "D:/tmp"
  28. home_dir = path.expanduser("~")
  29. self.default_local_path = path.join(home_dir, ".nlp_models")
  30. self.default_shared_path = "/share/pdf_processor/nlp_models"
  31. self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
  32. self.default_s3_path = "s3://llm-infra/models"
  33. self.nlp_models = self.nlp_models = {
  34. "en_core_web_sm": {
  35. "type": "spacy",
  36. "version": "3.7.1",
  37. },
  38. "en_core_web_md": {
  39. "type": "spacy",
  40. "version": "3.7.1",
  41. },
  42. "en_core_web_lg": {
  43. "type": "spacy",
  44. "version": "3.7.1",
  45. },
  46. "zh_core_web_sm": {
  47. "type": "spacy",
  48. "version": "3.7.0",
  49. },
  50. "zh_core_web_md": {
  51. "type": "spacy",
  52. "version": "3.7.0",
  53. },
  54. "zh_core_web_lg": {
  55. "type": "spacy",
  56. "version": "3.7.0",
  57. },
  58. }
  59. self.en_core_web_sm_model = en_core_web_sm.load()
  60. self.zh_core_web_sm_model = zh_core_web_sm.load()
  61. def load_model(self, model_name, model_type, model_version):
  62. if (
  63. model_name in self.nlp_models
  64. and self.nlp_models[model_name]["type"] == model_type
  65. and self.nlp_models[model_name]["version"] == model_version
  66. ):
  67. return spacy.load(model_name) if spacy.util.is_package(model_name) else None
  68. else:
  69. logger.error(f"Unsupported model name or version: {model_name} {model_version}")
  70. return None
  71. def detect_language(self, text, use_langdetect=False):
  72. if len(text) == 0:
  73. return None
  74. if use_langdetect:
  75. # print("use_langdetect")
  76. # print(detect_lang(text))
  77. # return detect_lang(text)
  78. if detect_lang(text) == "zh":
  79. return "zh"
  80. else:
  81. return "en"
  82. if not use_langdetect:
  83. en_count = len(re.findall(r"[a-zA-Z]", text))
  84. cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
  85. if en_count > cn_count:
  86. return "en"
  87. if cn_count > en_count:
  88. return "zh"
  89. def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
  90. """
  91. Detect entity categories using NLP models and return the most frequent entity types.
  92. Parameters
  93. ----------
  94. text : str
  95. Text to be processed.
  96. Returns
  97. -------
  98. str
  99. The most frequent entity type.
  100. """
  101. lang = self.detect_language(text, use_langdetect=True)
  102. if lang == "en":
  103. nlp_model = self.en_core_web_sm_model
  104. elif lang == "zh":
  105. nlp_model = self.zh_core_web_sm_model
  106. else:
  107. # logger.error(f"Unsupported language: {lang}")
  108. return {}
  109. # Splitting text into smaller parts
  110. text_parts = re.split(r"[,;,;、\s & |]+", text)
  111. text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] # Remove non-words
  112. text_combined = " ".join(text_parts)
  113. try:
  114. doc = nlp_model(text_combined)
  115. entity_counts = Counter([ent.label_ for ent in doc.ents])
  116. word_counts_in_entities = Counter()
  117. for ent in doc.ents:
  118. word_counts_in_entities[ent.label_] += len(ent.text.split())
  119. total_words_in_entities = sum(word_counts_in_entities.values())
  120. total_words = len([token for token in doc if not token.is_punct])
  121. if total_words_in_entities == 0 or total_words == 0:
  122. return None
  123. entity_percentage = total_words_in_entities / total_words
  124. if entity_percentage < 0.5:
  125. return None
  126. most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
  127. entity_percentage = word_count / total_words_in_entities
  128. if entity_percentage >= threshold:
  129. return most_common_entity
  130. else:
  131. return None
  132. except Exception as e:
  133. logger.error(f"Error in entity detection: {e}")
  134. return None
  135. def __main__():
  136. nlpModel = NLPModels()
  137. test_strings = [
  138. "张三",
  139. "张三, 李四,王五; 赵六",
  140. "John Doe",
  141. "Jane Smith",
  142. "Lee, John",
  143. "John Doe, Jane Smith; Alice Johnson,Bob Lee",
  144. "孙七, Michael Jordan;赵八",
  145. "David Smith Michael O'Connor; Kevin ßáçøñ",
  146. "李雷·韩梅梅, 张三·李四",
  147. "Charles Robert Darwin, Isaac Newton",
  148. "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
  149. "John Doe, Jane Smith; Alice Johnson",
  150. "张三, 李四,王五; 赵六",
  151. "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
  152. "Rachel Mills & William Barry & Susanne B. Haga",
  153. "Claire Chabut* and Jean-François Bussières",
  154. "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
  155. "Changchun",
  156. "china",
  157. "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
  158. "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
  159. "Synergistic Effect of Supported Nickel Catalyst with",
  160. "Intumescent Flame-Retardants on Flame Retardancy",
  161. "and Thermal Stability of Polypropylene",
  162. ]
  163. for test in test_strings:
  164. print()
  165. print(f"Original String: {test}")
  166. result = nlpModel.detect_entity_catgr_using_nlp(test)
  167. print(f"Detected entities: {result}")
  168. if __name__ == "__main__":
  169. __main__()