Perhaps something like this:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans, AgglomerativeClustering
import numpy as np

# Note: each "sentence" is a single-element list, so Word2Vec treats the
# whole phrase as one token rather than learning per-word vectors.
sentences = [["The Lord of the Rings J.R.R Tolkien"],
             ["Lord of the Rings Good condition"],
             ["Very good condition Lord of the Rings jrr Tolkien"],
             ["harry potter and the sorcerer's stone hardcover"],
             ["JK rowling harry potter and the sorcerer's stone"]]

# vector_size was called size in gensim < 4.0
m = Word2Vec(sentences, vector_size=50, min_count=1, sg=1)

def vectorizer(sent, m):
    """Average the vectors of all in-vocabulary tokens in a sentence."""
    vec = np.zeros(m.vector_size)
    numw = 0
    for w in sent:
        try:
            vec = np.add(vec, m.wv[w])  # m[w] in gensim < 4.0
            numw += 1
        except KeyError:  # skip out-of-vocabulary tokens
            pass
    return vec / max(numw, 1)

X = np.array([vectorizer(sent, m) for sent in sentences])

n_clusters = 2
clf = KMeans(n_clusters=n_clusters,
             max_iter=100,
             init='k-means++',
             n_init=1)
labels = clf.fit_predict(X)
print(labels)
for index, sentence in enumerate(sentences):
    print(str(labels[index]) + ":" + str(sentence))
Result:
0:['The Lord of the Rings J.R.R Tolkien']
0:['Lord of the Rings Good condition']
1:['Very good condition Lord of the Rings jrr Tolkien']
0:["harry potter and the sorcerer's stone hardcover"]
1:["JK rowling harry potter and the sorcerer's stone"]
KMeans is almost certainly not the best way to cluster text data of any kind, and you may want to look at other clustering algorithms as well. In this case, agglomerative clustering may be more robust.
This is interesting.
For example, if I change this...
for metric in ["cosine", "euclidean", "cityblock"]:
    clf = AgglomerativeClustering(n_clusters=n_clusters,
                                  linkage="average",
                                  metric=metric)  # affinity=metric in scikit-learn < 1.2
    labels = clf.fit_predict(X)
    for index, sentence in enumerate(sentences):
        print(str(labels[index]) + ":" + str(sentence))
I get...
1:['The Lord of the Rings J.R.R Tolkien']
0:['Lord of the Rings Good condition']
0:['Very good condition Lord of the Rings jrr Tolkien']
0:["harry potter and the sorcerer's stone hardcover"]
0:["JK rowling harry potter and the sorcerer's stone"]