【Title】: How can I calculate the coherence score in the sklearn implementation of NMF?
【Posted】: 2020-04-29 21:20:41
【Question】:

I am trying to build a utility in which an NMF model processes a dataset every few days. For the first run, I supply a starting value for the number of topics. How can I calculate the coherence score over the whole dataset? I plan to use that computed score to rebuild the model so that it becomes more accurate. Below is the code I have used.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
# plotting tools
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

dataset = pd.read_json('out.json', lines = True)
documents = dataset['attachment']

no_features = 1000
no_topics = 9
# print ('Old number of topics: ', no_topics)
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english', norm='l2')
tfidf = tfidf_vectorizer.fit_transform(documents)
# note: scikit-learn >= 1.0 renames this method to get_feature_names_out()
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# tfidf.shape is (n_documents, n_features), so retrain_value below is the
# number of documents, not a topic count -- it stands in as a placeholder
# until a coherence-based value is available
no_topics = tfidf.shape
retrain_value = no_topics[0]
# print('New number of topics :', retrain_value)

nmf = NMF(n_components = retrain_value, random_state = 1, alpha = .1, l1_ratio = .5, init = 'nndsvd').fit(tfidf)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d: " % (topic_idx))
        print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words -1:-1]]))

no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)

【Comments】:

    Tags: python machine-learning scikit-learn tfidfvectorizer nmf


    【Solution 1】:

    Unfortunately, sklearn.decomposition.NMF does not ship with a coherence model out of the box.
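
    If a ready-made metric is enough, one possible workaround (a minimal sketch, not part of the implementation linked below) is to hand the fitted model's top terms to gensim's CoherenceModel, which can score plain topic/term lists. It reuses `nmf`, `tfidf_feature_names`, and `documents` from your code, with a deliberately naive whitespace tokenizer:

    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel

    # naive tokenization for illustration only; ideally this should match
    # the TfidfVectorizer's own tokenization and lowercasing
    texts = [[w.lower() for w in doc.split()] for doc in documents]
    dictionary = Dictionary(texts)

    # top-20 terms per NMF topic, as plain token lists;
    # CoherenceModel requires every term to exist in the dictionary,
    # so terms it has not seen are filtered out
    topics = []
    for comp in nmf.components_:
        top = [tfidf_feature_names[i] for i in comp.argsort()[:-21:-1]]
        topics.append([t for t in top if t in dictionary.token2id])

    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence="c_v")
    print("c_v coherence: %.4f" % cm.get_coherence())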

    I ran into the same problem and found a custom implementation that uses Python 3.8.

    It should be easy to adapt to your code; please check the link for the full imports and surrounding context.

    A snippet from my recent use of that technique:

    # imports used by the snippet (collected here so it runs standalone;
    # see the linked post for the original context)
    import re
    import gensim
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    from itertools import combinations
    from sklearn import decomposition

    # assumed inputs, defined in the linked full example:
    #   A          - the document-term matrix (e.g. your tfidf matrix)
    #   terms      - the vectorizer's feature names
    #   docs_raw   - the raw document strings
    #   stop_words - the stopword set
    kmin, kmax = 2, 30

    topic_models = []
    # try each value of k
    for k in range(kmin,kmax+1):
        print("Applying NMF for k=%d ..." % k )
        # run NMF
        model = decomposition.NMF( init="nndsvd", n_components=k )
        W = model.fit_transform( A )
        H = model.components_
        # store for later
        topic_models.append( (k,W,H) )

    class TokenGenerator:
        def __init__( self, documents, stopwords ):
            self.documents = documents
            self.stopwords = stopwords
            self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

        def __iter__( self ):
            print("Building Word2Vec model ...")
            for doc in self.documents:
                tokens = []
                for tok in self.tokenizer.findall( doc ):
                    if tok.lower() in self.stopwords:
                        tokens.append( "<stopword>" )
                    elif len(tok) >= 2:
                        tokens.append( tok.lower() )
                yield tokens

    docgen = TokenGenerator(docs_raw, stop_words)
    # note: gensim >= 4.0 renames the `size` parameter to `vector_size`
    w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)

    def calculate_coherence( w2v_model, term_rankings ):
        # topic coherence here = mean pairwise Word2Vec similarity of a
        # topic's top terms, averaged across all topics
        overall_coherence = 0.0
        for topic_index in range(len(term_rankings)):
            # check each pair of terms
            pair_scores = []
            for pair in combinations( term_rankings[topic_index], 2 ):
                # note: gensim >= 4.0 moves this to w2v_model.wv.similarity(...)
                pair_scores.append( w2v_model.similarity(pair[0], pair[1]))
            # get the mean for all pairs in this topic
            topic_score = sum(pair_scores) / len(pair_scores)
            overall_coherence += topic_score
        # get the mean score across all topics
        return overall_coherence / len(term_rankings)

    def get_descriptor( all_terms, H, topic_index, top ):
        # reverse sort the values to sort the indices
        top_indices = np.argsort( H[topic_index,:] )[::-1]
        # now get the terms corresponding to the top-ranked indices
        top_terms = []
        for term_index in top_indices[0:top]:
            top_terms.append( all_terms[term_index] )
        return top_terms

    k_values = []
    coherences = []
    for (k,W,H) in topic_models:
        # get all of the topic descriptors - the term_rankings, based on top 10 terms
        term_rankings = []
        for topic_index in range(k):
            term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
        # now calculate the coherence based on our Word2vec model
        k_values.append( k )
        coherences.append( calculate_coherence( w2v_model, term_rankings ) )
        print("K=%02d: Coherence=%.4f" % ( k, coherences[-1] ) )

    # Jupyter magic; remove this line when running as a plain script
    %matplotlib inline
    plt.style.use("ggplot")
    matplotlib.rcParams.update({"font.size": 14})

    fig = plt.figure(figsize=(13,7))
    # create the line plot
    ax = plt.plot( k_values, coherences )
    plt.xticks(k_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Mean Coherence")
    # add the points
    plt.scatter( k_values, coherences, s=120)
    # find and annotate the maximum point on the plot
    ymax = max(coherences)
    xpos = coherences.index(ymax)
    best_k = k_values[xpos]
    # offset the label a few points from the peak so it does not overlap it
    plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(5, 10), textcoords="offset points", fontsize=16)
    # show the plot
    plt.show()
    

    Results:

    K=02: Coherence=0.4157
    K=03: Coherence=0.4399
    K=04: Coherence=0.4626
    K=05: Coherence=0.4333
    K=06: Coherence=0.4075
    K=07: Coherence=0.4121
    ...
    
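    To close the loop with your goal of rebuilding the model, the peak of the coherence curve can be fed straight back into your pipeline. A minimal sketch, reusing `best_k` from the plot above together with `tfidf`, `tfidf_feature_names`, and `display_topics` from your question:

    from sklearn.decomposition import NMF

    # refit the model with the coherence-selected number of topics
    final_nmf = NMF(n_components=best_k, random_state=1, init="nndsvd").fit(tfidf)
    display_topics(final_nmf, tfidf_feature_names, 20)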

    【Discussion】:
