您要做的是将部分文本映射到在 NLP 中称为 entity linking 的真实世界实体。
您可以使用您指定的模式,通过实体规则器(entity ruler)扩展通常的 spaCy 管道(例如 “en_core_web_lg”),以确保所有相关文本都被识别为实体。您可以进一步编写一个实体链接器(比 spaCy 自带的实体链接器简单得多),在知识库中查找每个文本实体应该映射到哪个真实世界的实体。
您可以从同一个字典 (ENTITIES_MAP) 创建模式(用于实体标尺)和知识库(用于实体链接器),以便于扩展。
使用 spaCy 3,代码可能看起来像这样。(请注意,链接器还可以进一步简化;我遵循了官方组件(official component)的一般结构,因此略显复杂。)
from typing import Callable, Optional, List, Iterable, Dict
import spacy
from spacy import Language, Vocab
from spacy.kb import KnowledgeBase
from spacy.tokens import Doc
# Map from entity to how it could look like in a text
# This can be edited in order to change the patterns and kb
# Keys are the canonical real-world entity ids; values list every surface
# form (alias) that should be recognized and linked to that entity.
ENTITIES_MAP: Dict[str, List[str]] = {
    "harman": [
        "harman",
        "Harman",
        "HARMAN",
        "HARMAN International",
        "Harman International",
    ],
    "other_company": [
        "Other Company",
        "OC"
    ]
}
class CustomLinker:
    """Minimal entity-linker pipeline component.

    Resolves each entity span of a ``Doc`` to a knowledge-base entity id by
    exact alias lookup. The knowledge base must be attached via
    :meth:`set_kb` before the component is called, and every alias must map
    to exactly one entity (unambiguous KB).
    """

    def __init__(self, vocab: Vocab) -> None:
        self._vocab = vocab
        # Populated lazily by set_kb(); predict() refuses to run without it.
        self._kb: Optional[KnowledgeBase] = None

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]) -> None:
        """Build and attach the knowledge base from a loader callable."""
        self._kb = kb_loader(self._vocab)

    def __call__(self, doc: Doc) -> Doc:
        """Annotate all entities in *doc* with their KB ids and return it."""
        kb_ids = self.predict([doc])
        self.set_annotations([doc], kb_ids)
        return doc

    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Return one KB id per entity, flattened across *docs*.

        Entities whose text matches no alias get the sentinel id ``"NIL"``.

        Raises:
            RuntimeError: if ``set_kb()`` was never called or the KB is empty.
            ValueError: if an alias maps to more than one candidate entity.
        """
        # Explicit raises instead of ``assert``: assertions are stripped
        # under ``python -O`` and must not guard runtime state.
        if self._kb is None:
            raise RuntimeError("You forgot to call 'set_kb()'")
        if len(self._kb) == 0:
            raise RuntimeError("kb is empty")
        kb_ids: List[str] = []
        for doc in docs:
            for ent in doc.ents:
                candidates = self._kb.get_alias_candidates(ent.text)
                if not candidates:
                    # Unknown alias -> no real-world entity.
                    kb_ids.append("NIL")
                elif len(candidates) == 1:
                    kb_ids.append(candidates[0].entity_)
                else:
                    # This simple component only supports unambiguous KBs.
                    raise ValueError("The kb was set up ambiguously")
        return kb_ids

    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Write the predicted KB ids back onto the entity tokens.

        Raises:
            ValueError: if the number of ids does not match the number of
                entities across *docs*.
        """
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(
                f"Number of entities is {count_ents}, "
                f"but number of kb_ids is {len(kb_ids)}"
            )
        i = 0
        for doc in docs:
            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                # kb_id_ on a Span is derived from its tokens, so set it
                # token by token.
                for token in ent:
                    token.ent_kb_id_ = kb_id
@Language.factory("custom_linker")
def make_entity_linker(nlp: Language, name: str) -> CustomLinker:
    """Register :class:`CustomLinker` as the ``custom_linker`` component."""
    return CustomLinker(vocab=nlp.vocab)
def create_pipeline() -> Language:
    """Build the demo pipeline: base model + entity ruler + custom linker."""
    nlp = spacy.load("en_core_web_lg")

    # Entity ruler so that every alias in ENTITIES_MAP is recognized as an
    # entity at all, regardless of what the statistical NER says.
    patterns = []
    for aliases in ENTITIES_MAP.values():
        for alias in aliases:
            token_pattern = [{"TEXT": word} for word in alias.split(" ")]
            patterns.append({"label": "ORG", "pattern": token_pattern})
    ruler = nlp.add_pipe("entity_ruler", last=True)
    ruler.add_patterns(patterns)

    # Entity linker so that all aliases of one entity map to the same
    # real-world entity id.
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=300)
        for entity_id, aliases in ENTITIES_MAP.items():
            kb.add_entity(entity_id, freq=1, entity_vector=vocab[entity_id].vector)
            for alias in aliases:
                kb.add_alias(alias, entities=[entity_id], probabilities=[1.0])
        return kb

    linker = nlp.add_pipe("custom_linker")
    linker.set_kb(create_kb)
    return nlp
def main():
    """Run the pipeline over a sample text and report the linked entities."""
    nlp = create_pipeline()
    text = (
        "The company HARMAN International is doing NLP."
        " Some other names for it are harman, Harman, Harman International and HARMAN, but not HI."
        " Other Company is a different one. Some also call it by the abbreviation OC."
    )
    doc = nlp(text)
    print([(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
    # Count mentions per linked real-world entity, skipping the NIL sentinel.
    distinct_real_world_entities = {ent.kb_id_ for ent in doc.ents if ent.kb_id_ != "NIL"}
    for rwe in distinct_real_world_entities:
        count = sum(1 for ent in doc.ents if ent.kb_id_ == rwe)
        print(f"'{rwe}' occurs {count} times")


if __name__ == "__main__":
    main()
它产生以下输出:
[('HARMAN International', 'ORG', 'harman'), ('NLP', 'ORG', 'NIL'), ('harman', 'ORG', 'harman'), ('Harman', 'ORG', 'harman'), ('Harman International', 'ORG', 'harman'), ('HARMAN', 'ORG', 'harman'), ('Other Company', 'ORG', 'other_company'), ('OC', 'ORG', 'other_company')]
'harman' occurs 5 times
'other_company' occurs 2 times
请注意,这只有在没有歧义的情况下才能正常工作。例如,如果您的文本同时包含“apple”(水果)和“Apple”(公司),那么使用 spaCy 自带的实体链接器并为其创建训练数据可能会更好。该过程在 spaCy 的官方教程中有说明(尽管针对的是旧版本的 spaCy)。