【发布时间】:2019-08-25 02:57:15
【问题描述】:
我正在预处理我的文本数据。数据保存在一个 CSV 文件(输入文件)中。现在我想打开这个文件、读取并预处理其中的文本,然后将结果保存到另一个 CSV 文件(输出文件)中。我尝试了多种方法,也在网上和 StackOverflow 上查找过,但都没有找到能解决我问题的答案。
import re, string, unicodedata
import nltk
import csv
import inflect
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Instantiate the lemmatizer (as is done for `ps`); the bare class object
# cannot be used to call lemmatize().
lem = WordNetLemmatizer()

# newline='' is what the csv module expects for files handed to
# csv.reader / csv.writer, so embedded newlines and line endings are
# handled by the csv layer rather than by the text layer.
with open('file.csv', 'r', newline='') as csv_file, \
        open('new_file.csv', 'w', newline='') as out_file:
    reader = csv.reader(csv_file)
    # NOTE: `writer` is only usable while this `with` block is open.
    writer = csv.writer(out_file)
    # Join every cell of every row into one string so that `text` is defined
    # before it is tokenized; the result is a flat list of word tokens.
    text = nltk.word_tokenize(' '.join(cell for row in reader for cell in row))
def non_ascii(text):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks. Encoding to ASCII with ``'ignore'`` then
    drops the marks and any other character that has no ASCII form.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with one ASCII-only string per input token, in the same
        order. A token made up entirely of non-ASCII characters becomes an
        empty string (it is not removed here).
    """
    return [
        # The bytes are pure ASCII after the encode step, so decoding them
        # as ASCII is lossless.
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        for word in text
    ]
def remove_punct(text):
    """Remove punctuation from every token and drop tokens left empty.

    A character is removed when it is neither a word character (``\\w``:
    letters, digits and underscore) nor whitespace. Underscores therefore
    survive.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list of the cleaned tokens, in order, excluding any token that
        became the empty string.
    """
    # Compile once instead of re-resolving the pattern for every token.
    non_word = re.compile(r'[^\w\s]')
    stripped = (non_word.sub('', word) for word in text)
    return [word for word in stripped if word != '']
def to_lower(text):
    """Lower-case every token.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list holding ``word.lower()`` for each input token, in order.
    """
    return [word.lower() for word in text]
def replace_numbers(text):
    """Spell out tokens that consist only of digits.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list in which every token for which ``str.isdigit()`` is true
        has been replaced by the result of inflect's ``number_to_words``;
        all other tokens are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in text
    ]
def remove_stopwords(text):
    """Drop tokens that appear in NLTK's English stop-word list.

    The comparison is exact and case-sensitive, so tokens should be
    lower-cased before calling this function.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list of the tokens that are not in the stop-word list, in
        their original order.
    """
    # Load the list once and use a set: the original re-read the corpus list
    # and scanned it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in text if word not in english_stopwords]
def stem_words(text):
    """Stem every token with NLTK's Lancaster stemmer.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with the stem of each input token, in order.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in text]
def lemmatize_verbs(text):
    """Lemmatize every token as a verb with NLTK's WordNet lemmatizer.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with the lemma of each input token (looked up with
        ``pos='v'``), in order.
    """
    verb_lemmatizer = WordNetLemmatizer()
    return [verb_lemmatizer.lemmatize(token, pos='v') for token in text]
def normalize(text):
    """Run the token-cleaning pipeline over a list of tokens.

    The steps are applied in this order, each one receiving the previous
    step's output: non_ascii, remove_punct, to_lower, replace_numbers,
    remove_stopwords.

    Args:
        text: Iterable of string tokens.

    Returns:
        The list of tokens produced by the last step.
    """
    cleaning_steps = (
        non_ascii,
        remove_punct,
        to_lower,
        replace_numbers,
        remove_stopwords,
    )
    for step in cleaning_steps:
        text = step(text)
    return text
# Replace the raw token list with its cleaned form by running it through
# normalize().
text = normalize(text)
def stem_lemmatize(text):
    """Produce both the stemmed and the verb-lemmatized form of the tokens.

    Both results are computed independently from the same input list.

    Args:
        text: Iterable of string tokens.

    Returns:
        A ``(stems, lemmas)`` tuple: the output of ``stem_words(text)`` and
        the output of ``lemmatize_verbs(text)``.
    """
    return stem_words(text), lemmatize_verbs(text)
stems, lemmas = stem_lemmatize(text)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

# Save the results: one CSV row with the stems, one with the lemmas.
# writerow() requires the row as its argument (calling it with none raises
# TypeError). The output file is opened here in its own context manager so
# it is guaranteed to be open while writing and closed afterwards; no
# explicit close() call is needed.
with open('new_file.csv', 'w', newline='') as result_file:
    result_writer = csv.writer(result_file)
    result_writer.writerow(stems)
    result_writer.writerow(lemmas)
我想将结果保存在 CSV 文件中。
【问题讨论】:
标签: python-3.7 import-csv