【发布时间】:2020-12-05 13:12:32
【问题描述】:
我在使用 Kmeans 算法标记数据时遇到问题。我的测试句子得到了真正的集群,但我没有得到真正的标签。我已经使用 numpy 将集群与 true_label_test 进行匹配,但是这个 kmeans 可以移动集群,真正的标签与集群的数量不匹配。我需要帮助解决这个问题。这是我的代码
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter
stop = set(stopwords.words('indonesian'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
# Cleaning the text sentences so that punctuation marks, stop words & digits are removed
def clean(doc):
stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
processed = re.sub(r"\d+","",normalized)
y = processed.split()
#print (y)
return y
path = "coba.txt"
train_clean_sentences = []
fp = open(path,'r')
for line in fp:
line = line.strip()
cleaned = clean(line)
cleaned = ' '.join(cleaned)
train_clean_sentences.append(cleaned)
#print(train_clean_sentences)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_clean_sentences)
# Clustering the training 30 sentences with K-means technique
modelkmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)
teks_satu = "Aplikasi Machine Learning untuk mengenali daun mangga dengan metode CNN"
test_clean_sentence = []
cleaned_test = clean(teks_satu)
cleaned = ' '.join(cleaned_test)
cleaned = re.sub(r"\d+","",cleaned)
test_clean_sentence.append(cleaned)
Test = vectorizer.transform(test_clean_sentence)
true_test_labels = ['AI','VR','Sistem Informasi']
predicted_labels_kmeans = modelkmeans.predict(Test)
print(predicted_labels_kmeans)
print ("\n-------------------------------PREDICTIONS BY K-Means--------------------------------------")
print ("\nIndex of Virtual Reality : ",Counter(modelkmeans.labels_[5:10]).most_common(1)[0][0])
print ("Index of Machine Learning : ",Counter(modelkmeans.labels_[0:5]).most_common(1)[0][0])
print ("Index of Sistem Informasi : ",Counter(modelkmeans.labels_[10:15]).most_common(1)[0][0])
print ("\n",teks_satu,":",true_test_labels[np.int(predicted_labels_kmeans)],":",predicted_labels_kmeans)
【问题讨论】:
标签: python numpy flask scikit-learn k-means