【发布时间】:2020-04-30 07:17:02
【问题描述】:
我正在关注一篇优秀的媒体文章:https://towardsdatascience.com/k-medoids-clustering-on-iris-data-set-1931bf781e05 从头开始实施 kmedoids。代码中有一个地方计算每个像素到中心点的距离,而且速度非常慢。它在循环中有 numpy.linalg.norm。有没有办法使用 numpy.linalg.norm 或使用 numpy 广播或 scipy.spatial.distance.cdist 和 np.argmin 来优化它来做同样的事情?
###helper function here###
def compute_d_p(X, medoids, p):
m = len(X)
medoids_shape = medoids.shape
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids_shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
k = len(medoids)
S = np.empty((m, k))
for i in range(m):
d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
S[i, :] = d_i**p
return S
这就是减速发生的地方
for datap in cluster_points:
new_medoid = datap
new_dissimilarity= np.sum(compute_d_p(X, datap, p))
if new_dissimilarity < avg_dissimilarity :
avg_dissimilarity = new_dissimilarity
out_medoids[i] = datap
下面的完整代码。所有致谢文章作者。
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# Dataset
iris = datasets.load_iris()
data = pd.DataFrame(iris.data,columns = iris.feature_names)
target = iris.target_names
labels = iris.target
#Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
#PCA Transformation
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
PCAdf = pd.DataFrame(data = principalComponents , columns = ['principal component 1', 'principal component 2','principal component 3'])
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3
def init_medoids(X, k):
from numpy.random import choice
from numpy.random import seed
seed(1)
samples = choice(len(X), size=k, replace=False)
return X[samples, :]
medoids_initial = init_medoids(datapoints, 3)
def compute_d_p(X, medoids, p):
m = len(X)
medoids_shape = medoids.shape
# If a 1-D array is provided,
# it will be reshaped to a single row 2-D array
if len(medoids_shape) == 1:
medoids = medoids.reshape((1,len(medoids)))
k = len(medoids)
S = np.empty((m, k))
for i in range(m):
d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
S[i, :] = d_i**p
return S
S = compute_d_p(datapoints, medoids_initial, 2)
def assign_labels(S):
return np.argmin(S, axis=1)
labels = assign_labels(S)
def update_medoids(X, medoids, p):
S = compute_d_p(points, medoids, p)
labels = assign_labels(S)
out_medoids = medoids
for i in set(labels):
avg_dissimilarity = np.sum(compute_d_p(points, medoids[i], p))
cluster_points = points[labels == i]
for datap in cluster_points:
new_medoid = datap
new_dissimilarity= np.sum(compute_d_p(points, datap, p))
if new_dissimilarity < avg_dissimilarity :
avg_dissimilarity = new_dissimilarity
out_medoids[i] = datap
return out_medoids
def has_converged(old_medoids, medoids):
return set([tuple(x) for x in old_medoids]) == set([tuple(x) for x in medoids])
#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
if starting_medoids is None:
medoids = init_medoids(X, k)
else:
medoids = starting_medoids
converged = False
labels = np.zeros(len(X))
i = 1
while (not converged) and (i <= max_steps):
old_medoids = medoids.copy()
S = compute_d_p(X, medoids, p)
labels = assign_labels(S)
medoids = update_medoids(X, medoids, p)
converged = has_converged(old_medoids, medoids)
i += 1
return (medoids,labels)
results = kmedoids(datapoints, 3, 2)
final_medoids = results[0]
data['clusters'] = results[1]
【问题讨论】:
-
@senderle 编辑放完整代码