您要做的是将部分文本映射到在 NLP 中称为 entity linking 的真实世界实体。
您可以使用您指定的模式,通过实体规则器(entity ruler)扩展通常的 spaCy 管道(例如 “en_core_web_lg”),以确保所有相关文本都被识别为实体。您可以进一步编写一个实体链接器(比 spaCy 自带的实体链接器简单得多),在知识库中查找每个文本实体应该映射到哪个真实世界的实体。
您可以从同一个字典 (ENTITIES_MAP) 创建模式(用于实体标尺)和知识库(用于实体链接器),以便于扩展。
使用 spaCy 3,代码可能看起来像这样。(请注意,链接器还可以进一步简化;我遵循了官方组件(official component)的一般结构,因此略显复杂。)
from typing import Callable, Optional, List, Iterable, Dict
import spacy
from spacy import Language, Vocab
from spacy.kb import KnowledgeBase
from spacy.tokens import Doc
# Map from entity to how it could look like in a text
# This can be edited in order to change the patterns and kb
# Keys are the canonical real-world entity ids; values list every surface
# form (alias) that should be recognized and linked to that entity.
ENTITIES_MAP: Dict[str, List[str]] = {
    "harman": [
        "harman",
        "Harman",
        "HARMAN",
        "HARMAN International",
        "Harman International",
    ],
    "other_company": [
        "Other Company",
        "OC"
    ]
}
class CustomLinker:
    """Minimal entity-linker pipeline component.

    Resolves each entity span of a ``Doc`` to a knowledge-base entity id by
    exact alias lookup. The knowledge base must be attached via
    :meth:`set_kb` before the component is called, and every alias must map
    to exactly one entity (unambiguous KB).
    """

    def __init__(self, vocab: Vocab) -> None:
        self._vocab = vocab
        # Populated lazily by set_kb(); predict() refuses to run without it.
        self._kb: Optional[KnowledgeBase] = None

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]) -> None:
        """Build and attach the knowledge base from a loader callable."""
        self._kb = kb_loader(self._vocab)

    def __call__(self, doc: Doc) -> Doc:
        """Annotate all entities in *doc* with their KB ids and return it."""
        kb_ids = self.predict([doc])
        self.set_annotations([doc], kb_ids)
        return doc

    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Return one KB id per entity, flattened across *docs*.

        Entities whose text matches no alias get the sentinel id ``"NIL"``.

        Raises:
            RuntimeError: if ``set_kb()`` was never called or the KB is empty.
            ValueError: if an alias maps to more than one candidate entity.
        """
        # Explicit raises instead of ``assert``: assertions are stripped
        # under ``python -O`` and must not guard runtime state.
        if self._kb is None:
            raise RuntimeError("You forgot to call 'set_kb()'")
        if len(self._kb) == 0:
            raise RuntimeError("kb is empty")
        kb_ids: List[str] = []
        for doc in docs:
            for ent in doc.ents:
                candidates = self._kb.get_alias_candidates(ent.text)
                if not candidates:
                    # Unknown alias -> no real-world entity.
                    kb_ids.append("NIL")
                elif len(candidates) == 1:
                    kb_ids.append(candidates[0].entity_)
                else:
                    # This simple component only supports unambiguous KBs.
                    raise ValueError("The kb was set up ambiguously")
        return kb_ids

    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Write the predicted KB ids back onto the entity tokens.

        Raises:
            ValueError: if the number of ids does not match the number of
                entities across *docs*.
        """
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(
                f"Number of entities is {count_ents}, "
                f"but number of kb_ids is {len(kb_ids)}"
            )
        i = 0
        for doc in docs:
            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                # kb_id_ on a Span is derived from its tokens, so set it
                # token by token.
                for token in ent:
                    token.ent_kb_id_ = kb_id
@Language.factory("custom_linker")
def make_entity_linker(nlp: Language, name: str) -> CustomLinker:
    """Register :class:`CustomLinker` as the ``custom_linker`` component."""
    return CustomLinker(vocab=nlp.vocab)
def create_pipeline() -> Language:
    """Build the demo pipeline: base model + entity ruler + custom linker."""
    nlp = spacy.load("en_core_web_lg")

    # Entity ruler so that every alias in ENTITIES_MAP is recognized as an
    # entity at all, regardless of what the statistical NER says.
    patterns = []
    for aliases in ENTITIES_MAP.values():
        for alias in aliases:
            token_pattern = [{"TEXT": word} for word in alias.split(" ")]
            patterns.append({"label": "ORG", "pattern": token_pattern})
    ruler = nlp.add_pipe("entity_ruler", last=True)
    ruler.add_patterns(patterns)

    # Entity linker so that all aliases of one entity map to the same
    # real-world entity id.
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=300)
        for entity_id, aliases in ENTITIES_MAP.items():
            kb.add_entity(entity_id, freq=1, entity_vector=vocab[entity_id].vector)
            for alias in aliases:
                kb.add_alias(alias, entities=[entity_id], probabilities=[1.0])
        return kb

    linker = nlp.add_pipe("custom_linker")
    linker.set_kb(create_kb)
    return nlp
def main():
    """Run the pipeline over a sample text and report the linked entities."""
    nlp = create_pipeline()
    text = (
        "The company HARMAN International is doing NLP."
        " Some other names for it are harman, Harman, Harman International and HARMAN, but not HI."
        " Other Company is a different one. Some also call it by the abbreviation OC."
    )
    doc = nlp(text)
    print([(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
    # Count mentions per linked real-world entity, skipping the NIL sentinel.
    distinct_real_world_entities = {ent.kb_id_ for ent in doc.ents if ent.kb_id_ != "NIL"}
    for rwe in distinct_real_world_entities:
        count = sum(1 for ent in doc.ents if ent.kb_id_ == rwe)
        print(f"'{rwe}' occurs {count} times")


if __name__ == "__main__":
    main()
它产生以下输出:
[('HARMAN International', 'ORG', 'harman'), ('NLP', 'ORG', 'NIL'), ('harman', 'ORG', 'harman'), ('Harman', 'ORG', 'harman'), ('Harman International', 'ORG', 'harman'), ('HARMAN', 'ORG', 'harman'), ('Other Company', 'ORG', 'other_company'), ('OC', 'ORG', 'other_company')]
'harman' occurs 5 times
'other_company' occurs 2 times
请注意,这只有在没有歧义的情况下才能正常工作。例如,如果您的文本同时包含“apple”(水果)和“Apple”(公司),那么使用 spaCy 自带的实体链接器并为其创建训练数据可能会更好。该过程在 spaCy 的官方教程中有说明(尽管针对的是旧版本的 spaCy)。