在搜索互联网并获得许多奇怪的评论较少的解决方案之后。我能够弄清楚该怎么做。如果您尝试做类似的事情,这里是代码。它包含来自各种来源的代码,其中很多是由我编写/编辑的。我希望它比其他人更容易理解。
该函数基于来自 scipy 的 kmeans2,它返回 centroid_list 和 label_list。 kmeansdata 是传递给 kmeans2 进行聚类的 numpy 数组,num_clusters 表示传递给 kmeans2 的集群数量。
代码写回一个新的 png 文件,确保它不会覆盖其他内容。还仅绘制 50 个集群(如果您有 1000 个集群,则不要尝试输出所有集群)
(它是为python2.7编写的,我猜应该也适用于其他版本。)
import numpy
import colorsys
import random
import os
from matplotlib.mlab import PCA as mlabPCA
from matplotlib import pyplot as plt
def get_colors(num_colors):
"""
Function to generate a list of randomly generated colors
The function first generates 256 different colors and then
we randomly select the number of colors required from it
num_colors -> Number of colors to generate
colors -> Consists of 256 different colors
random_colors -> Randomly returns required(num_color) colors
"""
colors = []
random_colors = []
# Generate 256 different colors and choose num_clors randomly
for i in numpy.arange(0., 360., 360. / 256.):
hue = i / 360.
lightness = (50 + numpy.random.rand() * 10) / 100.
saturation = (90 + numpy.random.rand() * 10) / 100.
colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
for i in range(0, num_colors):
random_colors.append(colors[random.randint(0, len(colors) - 1)])
return random_colors
def random_centroid_selector(total_clusters , clusters_plotted):
"""
Function to generate a list of randomly selected
centroids to plot on the output png
total_clusters -> Total number of clusters
clusters_plotted -> Number of clusters to plot
random_list -> Contains the index of clusters
to be plotted
"""
random_list = []
for i in range(0 , clusters_plotted):
random_list.append(random.randint(0, total_clusters - 1))
return random_list
def plot_cluster(kmeansdata, centroid_list, label_list , num_cluster):
"""
Function to convert the n-dimensional cluster to
2-dimensional cluster and plotting 50 random clusters
file%d.png -> file where the output is stored indexed
by first available file index
e.g. file1.png , file2.png ...
"""
mlab_pca = mlabPCA(kmeansdata)
cutoff = mlab_pca.fracs[1]
users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)
colors = get_colors(num_cluster)
plt.figure()
plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])
# Plotting 50 clusters only for now
random_list = random_centroid_selector(num_cluster , 50)
# Plotting only the centroids which were randomly_selected
# Centroids are represented as a large 'o' marker
for i, position in enumerate(centroids_2d):
if i in random_list:
plt.scatter(centroids_2d[i, 0], centroids_2d[i, 1], marker='o', c=colors[i], s=100)
# Plotting only the points whose centers were plotted
# Points are represented as a small '+' marker
for i, position in enumerate(label_list):
if position in random_list:
plt.scatter(users_2d[i, 0], users_2d[i, 1] , marker='+' , c=colors[position])
filename = "name"
i = 0
while True:
if os.path.isfile(filename + str(i) + ".png") == False:
#new index found write file and return
plt.savefig(filename + str(i) + ".png")
break
else:
#Changing index to next number
i = i + 1
return