【发布时间】:2021-07-31 11:28:45
【问题描述】:
我跟随 Joel Grus 的“从头开始的数据科学”并使用它编写了我自己的 KMeans 代码(将 Joel 的函数替换为 numpy 函数等)。下面的代码收敛并找到质心,但它们几乎总是在特征空间的中心。经过进一步调查,看起来 while 循环在第二次迭代中退出(即未检测到任何更改)。我不知道为什么,我做错了什么?
import numpy as np
import seaborn as sns
from sklearn.datasets import make_blobs
features, true_labels = make_blobs(
n_samples=200,
centers=3,
cluster_std=2.75,
random_state=42
)
class KMeans:
def __init__(self, k):
self.k = k
self.means = None
self.assignments = None
def classify(feat, centroids):
distances = [np.linalg.norm(feat - cent) for cent in centroids]
label = np.argwhere(distances == min(distances))
return label
def cluster_means(features, assignments, k):
clusters = [features[self.assingments == cluster,:] for cluster in range(k)]
cluster_means = np.array([np.mean(clusters[i], axis=0) for i in range(k)])
return cluster_means
def train(self, features):
self.assignments = np.random.randint(low=0, high=self.k, size=len(features))
while True:
#find the centroids of the k classes
self.means = cluster_means(features, self.assignments, self.k)
new_assignments = [classify(feat, self.means) for feat in features]
#get number of changes
nChanges = len([x1 for x1, x2 in zip(self.assignments, new_assignments) if x1 != x2])
if nChanges == 0:
return
self.assignments = new_assignments
#self.means = cluster_means(features, self.assignments, k)
print(f"changed: {nChanges} / {len(features)}")
km = KMeans(k=3)
km.train(features)
km.means
【问题讨论】: