【发布时间】:2018-02-19 17:12:52
【问题描述】:
我想在层次聚类过程中自动选取阈值。我不想手动输入阈值,而是希望通过代码检查得到的簇数是否落在 30 到 50 的范围内;如果不在该范围内,就在 Python 中以 0.1 或 0.2 的步长自动调整阈值,直到簇数落入该范围。请问该如何实现?
import pickle
import re
import string
import sys
# import gensim
# from gensim import corpora
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_word_complaints import complaint_stop_words
# Vectorise the corpus as TF-IDF weights over word unigrams and bigrams.
# `corpus`, `n_features`, `stop` and the timer start `t0` are expected to be
# defined before this point (not visible in this fragment).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
    max_features=n_features,
    stop_words=list(stop),
)
# Dense array so rows can be indexed and summed with plain NumPy below.
X = tfidf_vectorizer.fit_transform(corpus).toarray()
# Despite the name, these are ROW indices: rows of X whose entries do not
# sum to zero.
non_zero_features = np.flatnonzero(X.sum(axis=1) != 0)
print("done in %0.3fs." % (time() - t0))
# Pairwise cosine distances between the retained rows of X, followed by
# average-linkage hierarchical clustering. Both the square distance matrix and
# the linkage matrix are written to CSV files prefixed with `str_path`.
print("pdist ...")
t0 = time()
# pdist returns the CONDENSED form: a 1-D vector of the upper-triangle entries.
cos_dist = pdist(X[non_zero_features, :], 'cosine')
print("done in %0.3fs." % (time() - t0))
# Square n x n form, used for the CSV export below.
dists = np.asarray(squareform(cos_dist))
# Any undefined (NaN) distance is replaced by 1.
dists[np.isnan(dists)] = 1
print("linkage ...")
np.savetxt(str_path + "_dist_1.csv", dists, delimiter=',')
t0 = time()
# linkage() interprets a 2-D input as raw observation vectors, NOT as a
# precomputed distance matrix. Convert the cleaned square matrix back to the
# condensed form so the clustering really uses the cosine distances.
linkage_matrix = linkage(squareform(dists), "average")
print("done in %0.3fs." % (time() - t0))
np.savetxt(str_path + "linkage_matrix.csv", linkage_matrix, delimiter=',')
# Plot the dendrogram of `linkage_matrix` and save it as a PNG.
# The dendrogram is drawn exactly once, on an explicitly created axes; calling
# dendrogram() before the figure exists would draw onto an implicit extra
# figure and repeat the layout computation.
fig, ax = plt.subplots(nrows=1, ncols=1)  # create figure & 1 axis
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('sample index')
ax.set_ylabel('distance')
dendrogram(
    linkage_matrix,
    ax=ax,
    # leaf_rotation=90.,  # rotates the x axis labels
    # leaf_font_size=3.,  # font size for the x axis labels
)
# Save before plt.show(): show() blocks until the window is closed, so saving
# first guarantees the file is written even if the session is interrupted.
fig.savefig(str_path + 'Agglo_Heirachy_dendo.png')  # save the figure to file
plt.show()
# Column 2 of the linkage matrix holds the distance at which each merge
# happened; its extremes bound the range of useful cut thresholds.
merge_distances = linkage_matrix[:, 2]
min_th, max_th = min(merge_distances), max(merge_distances)
# get_clusters is defined elsewhere; it receives the linkage matrix together
# with the smallest and largest merge distance.
clusters = get_clusters(linkage_matrix, min_th, max_th)
【问题讨论】:
标签: python scikit-learn nltk hierarchical-clustering