【Title】: How can I calculate the coherence score in the sklearn implementation of NMF?
【Posted】: 2020-04-29 21:20:41
【Question】:

I am trying to build a utility in which an NMF model processes a dataset every few days. For the first run, I supply a starting value for the number of topics. How can I calculate the coherence score over the whole dataset? I plan to use that computed score to rebuild the model so that it becomes more accurate. Below is the code I have used.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
# plotting tools
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

dataset = pd.read_json('out.json', lines = True)
documents = dataset['attachment']

no_features = 1000
no_topics = 9
# print ('Old number of topics: ', no_topics)
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english', norm='l2')
tfidf = tfidf_vectorizer.fit_transform(documents)
# note: scikit-learn >= 1.0 renames this method to get_feature_names_out()
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# tfidf.shape is (n_documents, n_features), so retrain_value below is the
# number of documents, not a topic count -- it stands in as a placeholder
# until a coherence-based value is available
no_topics = tfidf.shape
retrain_value = no_topics[0]
# print('New number of topics :', retrain_value)

nmf = NMF(n_components = retrain_value, random_state = 1, alpha = .1, l1_ratio = .5, init = 'nndsvd').fit(tfidf)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d: " % (topic_idx))
        print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words -1:-1]]))

no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)

【Comments】:

    Tags: python machine-learning scikit-learn tfidfvectorizer nmf


    【Solution 1】:

    Unfortunately, sklearn.decomposition.NMF does not ship with a coherence model out of the box.
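
    If a ready-made metric is enough, one possible workaround (a minimal sketch, not part of the implementation linked below) is to hand the fitted model's top terms to gensim's CoherenceModel, which can score plain topic/term lists. It reuses `nmf`, `tfidf_feature_names`, and `documents` from your code, with a deliberately naive whitespace tokenizer:

    from gensim.corpora import Dictionary
    from gensim.models import CoherenceModel

    # naive tokenization for illustration only; ideally this should match
    # the TfidfVectorizer's own tokenization and lowercasing
    texts = [[w.lower() for w in doc.split()] for doc in documents]
    dictionary = Dictionary(texts)

    # top-20 terms per NMF topic, as plain token lists;
    # CoherenceModel requires every term to exist in the dictionary,
    # so terms it has not seen are filtered out
    topics = []
    for comp in nmf.components_:
        top = [tfidf_feature_names[i] for i in comp.argsort()[:-21:-1]]
        topics.append([t for t in top if t in dictionary.token2id])

    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence="c_v")
    print("c_v coherence: %.4f" % cm.get_coherence())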

    I ran into the same problem and found a custom implementation that uses Python 3.8.

    It should be easy to adapt to your code; please check the link for the full imports and surrounding context.

    A snippet from my recent use of that technique:

    # imports used by the snippet (collected here so it runs standalone;
    # see the linked post for the original context)
    import re
    import gensim
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    from itertools import combinations
    from sklearn import decomposition

    # assumed inputs, defined in the linked full example:
    #   A          - the document-term matrix (e.g. your tfidf matrix)
    #   terms      - the vectorizer's feature names
    #   docs_raw   - the raw document strings
    #   stop_words - the stopword set
    kmin, kmax = 2, 30

    topic_models = []
    # try each value of k
    for k in range(kmin,kmax+1):
        print("Applying NMF for k=%d ..." % k )
        # run NMF
        model = decomposition.NMF( init="nndsvd", n_components=k )
        W = model.fit_transform( A )
        H = model.components_
        # store for later
        topic_models.append( (k,W,H) )

    class TokenGenerator:
        def __init__( self, documents, stopwords ):
            self.documents = documents
            self.stopwords = stopwords
            self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

        def __iter__( self ):
            print("Building Word2Vec model ...")
            for doc in self.documents:
                tokens = []
                for tok in self.tokenizer.findall( doc ):
                    if tok.lower() in self.stopwords:
                        tokens.append( "<stopword>" )
                    elif len(tok) >= 2:
                        tokens.append( tok.lower() )
                yield tokens

    docgen = TokenGenerator(docs_raw, stop_words)
    # note: gensim >= 4.0 renames the `size` parameter to `vector_size`
    w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)

    def calculate_coherence( w2v_model, term_rankings ):
        # topic coherence here = mean pairwise Word2Vec similarity of a
        # topic's top terms, averaged across all topics
        overall_coherence = 0.0
        for topic_index in range(len(term_rankings)):
            # check each pair of terms
            pair_scores = []
            for pair in combinations( term_rankings[topic_index], 2 ):
                # note: gensim >= 4.0 moves this to w2v_model.wv.similarity(...)
                pair_scores.append( w2v_model.similarity(pair[0], pair[1]))
            # get the mean for all pairs in this topic
            topic_score = sum(pair_scores) / len(pair_scores)
            overall_coherence += topic_score
        # get the mean score across all topics
        return overall_coherence / len(term_rankings)

    def get_descriptor( all_terms, H, topic_index, top ):
        # reverse sort the values to sort the indices
        top_indices = np.argsort( H[topic_index,:] )[::-1]
        # now get the terms corresponding to the top-ranked indices
        top_terms = []
        for term_index in top_indices[0:top]:
            top_terms.append( all_terms[term_index] )
        return top_terms

    k_values = []
    coherences = []
    for (k,W,H) in topic_models:
        # get all of the topic descriptors - the term_rankings, based on top 10 terms
        term_rankings = []
        for topic_index in range(k):
            term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
        # now calculate the coherence based on our Word2vec model
        k_values.append( k )
        coherences.append( calculate_coherence( w2v_model, term_rankings ) )
        print("K=%02d: Coherence=%.4f" % ( k, coherences[-1] ) )

    # Jupyter magic; remove this line when running as a plain script
    %matplotlib inline
    plt.style.use("ggplot")
    matplotlib.rcParams.update({"font.size": 14})

    fig = plt.figure(figsize=(13,7))
    # create the line plot
    ax = plt.plot( k_values, coherences )
    plt.xticks(k_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Mean Coherence")
    # add the points
    plt.scatter( k_values, coherences, s=120)
    # find and annotate the maximum point on the plot
    ymax = max(coherences)
    xpos = coherences.index(ymax)
    best_k = k_values[xpos]
    # offset the label a few points from the peak so it does not overlap it
    plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(5, 10), textcoords="offset points", fontsize=16)
    # show the plot
    plt.show()
    

    Results:

    K=02: Coherence=0.4157
    K=03: Coherence=0.4399
    K=04: Coherence=0.4626
    K=05: Coherence=0.4333
    K=06: Coherence=0.4075
    K=07: Coherence=0.4121
    ...
    
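    To close the loop with your goal of rebuilding the model, the peak of the coherence curve can be fed straight back into your pipeline. A minimal sketch, reusing `best_k` from the plot above together with `tfidf`, `tfidf_feature_names`, and `display_topics` from your question:

    from sklearn.decomposition import NMF

    # refit the model with the coherence-selected number of topics
    final_nmf = NMF(n_components=best_k, random_state=1, init="nndsvd").fit(tfidf)
    display_topics(final_nmf, tfidf_feature_names, 20)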

    【Discussion】:
