【发布时间】:2020-12-30 03:31:06
【问题描述】:
我正在尝试使用 tsne 和 pyldavis 作为可视化来执行 lda 主题建模。然而,在获得主导主题的同时执行 lda 后,错误给出了太多的值而无法解包。代码和错误如下。非常感谢任何帮助。
LdaMulticore 主题建模代码:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
def Make_String(text):
return str(text)
#Reviews.columns=['Reviews']
#print(Reviews.head(10))
df['text']=df['text'].apply(lambda x: Make_String(x))
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Import Dataset
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']) , :]
df=pd.read_csv("/content/drive/My Drive/Negative_data.csv", encoding="ISO-8859-1")
print(df.shape) #> (2361, 3)
df.head()
# Create Dictionary
id2word = corpora.Dictionary(data_ready)
from gensim.models import LdaMulticore
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
lda_model = LdaMulticore( corpus, num_topics=10, id2word=id2word , passes=2, workers=2)
pprint(lda_model.print_topics())
#> [(0,
#> '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
#> '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
#> '0.007*"question"'),
#> (1,
#> '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
#> '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
#> '0.005*"way" + 0.004*"bible"'),
#> (2,
#> '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
#> '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
#> '0.004*"way" + 0.004*"ride"'),
#> (3,
#> '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
#> '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
#> '0.009*"season"')]
Output:
[(0,
'0.340*"seriously" + 0.017*"time" + 0.015*"samsung" + 0.014*"day" + '
'0.013*"phone" + 0.012*"order" + 0.012*"wait" + 0.011*"week" + 0.011*"damn" '
'+ 0.011*"next"'),
(1,
'0.081*"puma" + 0.068*"shoe" + 0.046*"adida" + 0.017*"site" + 0.017*"como" + '
'0.014*"wear" + 0.014*"ugly" + 0.011*"shirt" + 0.010*"era" + 0.009*"pumas"'),
(2,
'0.033*"watch" + 0.021*"hate" + 0.021*"wear" + 0.020*"shit" + 0.020*"buy" + '
'0.016*"game" + 0.014*"man" + 0.014*"stop" + 0.014*"time" + 0.013*"still"'),
(3,
'0.037*"bad" + 0.014*"year" + 0.013*"pay" + 0.013*"feel" + 0.011*"thing" + '
'0.011*"really" + 0.011*"last" + 0.011*"ever" + 0.009*"never" + '
'0.009*"people"'),
(4,
'0.332*"com" + 0.173*"twitter" + 0.078*"pic" + 0.036*"status" + '
'0.036*"https" + 0.029*"nintendo" + 0.015*"apple" + 0.008*"pue" + '
'0.006*"photo" + 0.004*"iphone"'),
(5,
'0.162*"http" + 0.028*"pace" + 0.027*"low" + 0.019*"new" + 0.019*"price" + '
'0.017*"crushed_km" + 0.017*"size" + 0.014*"video" + 0.012*"sale" + '
'0.012*"dlvr"'),
(6,
'0.062*"nike" + 0.019*"phone" + 0.019*"drop" + 0.018*"work" + 0.013*"tell" + '
'0.013*"hard" + 0.012*"call" + 0.011*"crazy" + 0.011*"lol" + 0.010*"ass"'),
(7,
'0.036*"sin" + 0.036*"die" + 0.024*"kill" + 0.018*"pero" + 0.012*"android" + '
'0.012*"pro" + 0.009*"death" + 0.008*"igual" + 0.008*"final" + '
'0.008*"problem"'),
(8,
'0.039*"black" + 0.036*"http" + 0.034*"netflix" + 0.020*"fire" + '
'0.018*"dead" + 0.014*"son" + 0.013*"lose" + 0.011*"tv" + 0.011*"tinyurl" + '
'0.010*"steal"'),
(9,
'0.299*"live" + 0.295*"alone" + 0.038*"seriously" + 0.013*"switch" + '
'0.008*"mad" + 0.006*"screen" + 0.006*"wrong" + 0.006*"season" + '
'0.005*"hour" + 0.005*"people"')]
主要主题的代码:
# Sentence Coloring of N Sentences
def topics_per_document(model, corpus, start=0, end=1):
corpus_sel = corpus[start:end]
dominant_topics = []
topic_percentages = []
for i, corp in enumerate(corpus_sel):
topic_percs, wordid_topics, wordid_phivalues = model[corp]
dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
dominant_topics.append((i, dominant_topic))
topic_percentages.append(topic_percs)
return(dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)
# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()
# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False)
for j, (topic, wt) in enumerate(topics) if j < 3]
df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)
错误:
<ipython-input-13-5ea2ada44643> in topics_per_document(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_sel):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
非常感谢
【问题讨论】:
标签: python dataframe dataset data-science topic-modeling