在 Python 内置类型和函数就够用的地方,尽量不要用 pandas 创建新的临时对象。我不清楚您要解决的具体问题,但仅仅清理掉在我看来冗余的部分之后,代码的运行速度就快了约 9 倍(0.045 秒对 0.410 秒):
import pandas
from fuzzywuzzy import process
from operator import itemgetter
import time
# Sample search phrases; several contain misspelled company names
# (e.g. 'wlmart', 'walmat', 'facbook').
words_data_set = pandas.DataFrame({
    'keywords': ['wlmart womens book set',
                 'microsoft fish sauce',
                 'books from walmat store',
                 'mens login for facebook fools',
                 'mens login for facbook fools',
                 'login for twetter boy',
                 'apples from cook']})
# Company names; any word that fuzzy-matches one of them is blanked out.
company_name_list = [
    'walmart', 'microsoft', 'facebook', 'twitter', 'amazon', 'apple']
print(len(words_data_set), '....rows')
start_time = time.time()
# Collect the per-word frames in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and returned a brand-new
# frame on every call.
word_frames = []
for s in words_data_set.keywords.tolist():
    # One row per word of the phrase, plus the full phrase in 'keywords'.
    step3 = pandas.DataFrame(s.split(), columns=['search_words'])
    step3['keywords'] = s
    for w in step3.search_words.tolist():
        # An explicit copy makes the assignment below unambiguous, so the
        # chained_assignment option no longer has to be switched off.
        # NOTE(review): a word repeated within one phrase selects all of
        # its rows here, so it yields duplicated rows - same as before.
        step4 = step3[step3.search_words == w].copy()
        # Blank the word when its best company match scores 90 or more.
        if max(process.extract(w, company_name_list), key=itemgetter(1))[1] >= 90:
            w = ''
        step4['search_words'] = w
        word_frames.append(step4)
# pandas.concat raises on an empty list, so fall back to an empty frame.
fuzzed_data_final = (
    pandas.concat(word_frames) if word_frames else pandas.DataFrame())
print("--- %s seconds ---" % (time.time() - start_time))
print(fuzzed_data_final)
现在输出:
7 ....rows
--- 0.04493832588195801 seconds ---
search_words keywords
0 wlmart womens book set
1 womens wlmart womens book set
2 wlmart womens book set
3 set wlmart womens book set
0 microsoft fish sauce
1 fish microsoft fish sauce
2 sauce microsoft fish sauce
0 books books from walmat store
1 from books from walmat store
2 books from walmat store
3 store books from walmat store
0 mens mens login for facebook fools
1 login mens login for facebook fools
2 for mens login for facebook fools
3 mens login for facebook fools
4 fools mens login for facebook fools
0 mens mens login for facbook fools
1 login mens login for facbook fools
2 for mens login for facbook fools
3 mens login for facbook fools
4 fools mens login for facbook fools
0 login login for twetter boy
1 for login for twetter boy
2 twetter login for twetter boy
3 boy login for twetter boy
0 apples from cook
1 from apples from cook
2 cook apples from cook
Process finished with exit code 0
之前的输出:
7 ....rows
/Users/alex/PycharmProjects/game/pandas_double_for_loop_original.py:18: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
step1['keywords2'] = step1.keywords.str.split()
/Users/alex/PycharmProjects/game/pandas_double_for_loop_original.py:36: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
step4['search_words'] = w
--- 0.4108889102935791 seconds ---
search_words keywords
0 wlmart womens book set
1 womens wlmart womens book set
2 wlmart womens book set
3 set wlmart womens book set
0 microsoft fish sauce
1 fish microsoft fish sauce
2 sauce microsoft fish sauce
0 books books from walmat store
1 from books from walmat store
2 books from walmat store
3 store books from walmat store
0 mens mens login for facebook fools
1 login mens login for facebook fools
2 for mens login for facebook fools
3 mens login for facebook fools
4 fools mens login for facebook fools
0 mens mens login for facbook fools
1 login mens login for facbook fools
2 for mens login for facbook fools
3 mens login for facbook fools
4 fools mens login for facbook fools
0 login login for twetter boy
1 for login for twetter boy
2 twetter login for twetter boy
3 boy login for twetter boy
0 apples from cook
1 from apples from cook
2 cook apples from cook
Process finished with exit code 0
更新:关于双循环效率的答案。这是第 2 版程序:
import pandas
from fuzzywuzzy import process
import time
# Search phrases to scan, one phrase per entry.
lines = [
    'wlmart womens book set',
    'microsoft fish sauce',
    'books from walmat store',
    'mens login for facebook fools',
    'mens login for facbook fools',
    'login for twetter boy',
    'apples from cook',
]
# Company names that every word is fuzzy-matched against.
companies = [
    'walmart',
    'microsoft',
    'facebook',
    'twitter',
    'amazon',
    'apple',
]
# Shared state: part1 appends the per-line match flags to lines_results,
# part2 rebinds fuzzed_data_final to the frame it builds.
lines_results = []
fuzzed_data_final = pandas.DataFrame()
def part0():
    """Baseline step: visit every word of every line and print the total."""
    # Still iterates word by word, so the step keeps measuring the bare
    # cost of the double loop.
    total = sum(1 for line in lines for _ in line.split())
    print('Part 0. Count all words.\n', total, 'words')
def part1():
    """Fuzzy-match every word against ``companies`` and record the hits.

    Appends one list of booleans per line to the module-level
    ``lines_results``; a flag is True when ``process.extractBests``
    (``score_cutoff=90``, ``limit=1``) returned a match for that word.
    Prints the accumulated ``lines_results`` afterwards.
    """
    for line in lines:
        flags = [
            bool(process.extractBests(
                word, companies, score_cutoff=90, limit=1))
            for word in line.split()
        ]
        lines_results.append(flags)
    print('Part 1. Match all words.\n', lines_results)
def part2():
    """Build ``fuzzed_data_final`` from the flags stored in ``lines_results``.

    For each word of each line a small frame with the columns
    ``search_words`` and ``keywords`` is produced; ``search_words`` is
    blanked when ``lines_results[i][j]`` is truthy. The new frames are
    added to the module-level ``fuzzed_data_final`` and the result is
    printed.

    ``lines_results`` must already hold one flag per word of every line
    (an IndexError is raised otherwise).
    """
    global fuzzed_data_final
    # Gather the frames and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and returned a brand-new frame on every call.
    word_frames = []
    for i, line in enumerate(lines):
        step3 = pandas.DataFrame(line.split(), columns=['search_words'])
        step3['keywords'] = line
        for j, word in enumerate(line.split()):
            # Explicit copy: the assignment below targets an independent
            # frame, so chained_assignment need not be switched off.
            # NOTE(review): a word repeated within one line selects all of
            # its rows here, so it yields duplicated rows - same as before.
            step4 = step3[step3.search_words == word].copy()
            step4['search_words'] = '' if lines_results[i][j] else word
            word_frames.append(step4)
    if word_frames:
        # Keep rows already present in the global frame in front of the new ones.
        if not fuzzed_data_final.empty:
            word_frames.insert(0, fuzzed_data_final)
        fuzzed_data_final = pandas.concat(word_frames)
    print('Part 2. Create pandas.DataFrame fuzzed_data_final.\n',
          fuzzed_data_final)
def execute(f):
    """Run ``f`` once, then print its wall time and a scaled-up estimate.

    The measured time is divided by the number of sample lines and by the
    number of company names, then multiplied up to 1 million rows and
    2000 company names; the estimate is printed in seconds, hours and days.
    """
    rows = 1
    names = 2000
    began = time.perf_counter()
    f()
    total_time = time.perf_counter() - began
    print("--- %f seconds ---" % total_time)
    # Cost of one (line, company) pair in the sample run.
    per_pair = total_time / len(lines) / len(companies)
    e = per_pair * rows * 1000000. * names
    h = e / 3600
    d = h / 24
    report = ('Time estimation for %d million rows and %d company names: %d seconds or'
              ' %d hours or %d days')
    print(report % (rows, names, e, h, d))
# Time the three steps in order; part2 reads the flags that part1 stored.
for step in (part0, part1, part2):
    execute(step)
输出:
Part 0. Count all words.
28 words
--- 0.000032 seconds ---
Time estimation for 1 million rows and 2000 company names: 1534 seconds or 0 hours or 0 days
Part 1. Match all words.
[[True, False, True, False], [True, False, False], [False, False, True, False], [False, False, False, True, False], [False, False, False, True, False], [False, False, False, False], [True, False, False]]
--- 0.006723 seconds ---
Time estimation for 1 million rows and 2000 company names: 320165 seconds or 88 hours or 3 days
Part 2. Create pandas.DataFrame fuzzed_data_final.
search_words keywords
0 wlmart womens book set
1 womens wlmart womens book set
2 wlmart womens book set
3 set wlmart womens book set
0 microsoft fish sauce
1 fish microsoft fish sauce
2 sauce microsoft fish sauce
0 books books from walmat store
1 from books from walmat store
2 books from walmat store
3 store books from walmat store
0 mens mens login for facebook fools
1 login mens login for facebook fools
2 for mens login for facebook fools
3 mens login for facebook fools
4 fools mens login for facebook fools
0 mens mens login for facbook fools
1 login mens login for facbook fools
2 for mens login for facbook fools
3 mens login for facbook fools
4 fools mens login for facbook fools
0 login login for twetter boy
1 for login for twetter boy
2 twetter login for twetter boy
3 boy login for twetter boy
0 apples from cook
1 from apples from cook
2 cook apples from cook
--- 0.042164 seconds ---
Time estimation for 1 million rows and 2000 company names: 2007804 seconds or 557 hours or 23 days
Process finished with exit code 0
因此,仅读取 100 万行并统计所有单词就需要大约半小时;对所有单词做模糊匹配需要 88 小时;创建约 4,000,000 行的 fuzzed_data_final 则需要 23 天。看看能不能优化一下。
更新 #2:优化创建 fuzzed_data_final
import pandas
from fuzzywuzzy import process
import time
# Search phrases to scan, one phrase per entry.
lines = [
    'wlmart womens book set', 'microsoft fish sauce',
    'books from walmat store', 'mens login for facebook fools',
    'mens login for facbook fools', 'login for twetter boy',
    'apples from cook'
]
# Company names that every word is fuzzy-matched against.
companies = ['walmart', 'microsoft', 'facebook', 'twitter', 'amazon', 'apple']
start_time = time.perf_counter()
# Plain Python lists are filled in the loop; the DataFrame is built once.
keywords = []
search_words = []
for line in lines:
    for word in line.split():
        # Non-empty when some company name passes the score_cutoff filter.
        match_score_list = process.extractBests(
            word, companies, score_cutoff=90, limit=1)
        keywords.append(line)
        # Matched words are blanked, all others are kept as they are.
        search_words.append('' if match_score_list else word)
# The lists are passed directly; wrapping them in pandas.Series added
# nothing to the resulting frame.
fuzzed_data_final = pandas.DataFrame(
    {'search_words': search_words,
     'keywords': keywords})
total_time = time.perf_counter() - start_time
print("--- %f seconds ---" % total_time)
# Scale the measured time from the sample up to the target workload.
rows = 1
names = 2000
e = total_time / len(lines) / len(companies) * rows * 1000000. * names
h = e / 3600
d = h / 24
print('Time estimation for %d million rows and %d company names: %d seconds or'
      ' %d hours or %d days'
      % (rows, names, e, h, d))
print(fuzzed_data_final)
输出:
/usr/local/bin/python3.7 /Users/alex/PycharmProjects/game/pandas_doble_for_loop_v3.py
--- 0.008402 seconds ---
Time estimation for 1 million rows and 2000 company names: 400107 seconds or 111 hours or 4 days
search_words keywords
0 wlmart womens book set
1 womens wlmart womens book set
2 wlmart womens book set
3 set wlmart womens book set
4 microsoft fish sauce
5 fish microsoft fish sauce
6 sauce microsoft fish sauce
7 books books from walmat store
8 from books from walmat store
9 books from walmat store
10 store books from walmat store
11 mens mens login for facebook fools
12 login mens login for facebook fools
13 for mens login for facebook fools
14 mens login for facebook fools
15 fools mens login for facebook fools
16 mens mens login for facbook fools
17 login mens login for facbook fools
18 for mens login for facbook fools
19 mens login for facbook fools
20 fools mens login for facbook fools
21 login login for twetter boy
22 for login for twetter boy
23 twetter login for twetter boy
24 boy login for twetter boy
25 apples from cook
26 from apples from cook
27 cook apples from cook
Process finished with exit code 0
比原始版本快 47 倍。我还看到一个可以进一步提高 1,000,000 行文本处理性能的技巧:用字典缓存已经匹配过的单词。常用词汇量约为 20,000 个单词,每行大约有 10 个单词,因此每个单词平均重复 10,000,000/20,000 = 500 次。
更新 #3:为匹配的单词添加了字典
import pandas
from fuzzywuzzy import process
import time
# Search phrases to scan, one phrase per entry.
lines = [
    'wlmart womens book set', 'microsoft fish sauce',
    'books from walmat store', 'mens login for facebook fools',
    'mens login for facbook fools', 'login for twetter boy',
    'apples from cook'
]
# Company names that every word is fuzzy-matched against.
companies = ['walmart', 'microsoft', 'facebook', 'twitter', 'amazon', 'apple']
start_time = time.perf_counter()
keywords = []
search_words = []
# Cache: word -> True/False (did it match a company name), so each distinct
# word is fuzzy-matched only once.
dictionary = {}
for line in lines:
    for word in line.split():
        # Cached values are always True/False, so None means "not seen yet".
        score = dictionary.get(word)
        if score is None:
            score = dictionary[word] = bool(process.extractBests(
                word, companies, score_cutoff=90, limit=1))
        keywords.append(line)
        # Matched words are blanked, all others are kept as they are.
        search_words.append('' if score else word)
fuzzed_data_final = pandas.DataFrame(
    {'search_words': pandas.Series(search_words),
     'keywords': pandas.Series(keywords)})
total_time = time.perf_counter() - start_time
print("--- %f seconds ---" % total_time)
# Scale the measured time from the sample up to the target workload.
rows = 1
names = 2000
e = total_time / len(lines) / len(companies) * rows * 1000000. * names
h = e / 3600
d = h / 24
print('Time estimation for %d million rows and %d company names: %d seconds or'
      ' %d hours or %d days' % (rows, names, e, h, d))
print(fuzzed_data_final)
输出:
/usr/local/bin/python3.7 /Users/alex/PycharmProjects/game/pandas_doble_for_loop_v4.py
--- 0.005707 seconds ---
Time estimation for 1 million rows and 2000 company names: 271761 seconds or 75 hours or 3 days
search_words keywords
0 wlmart womens book set
1 womens wlmart womens book set
2 wlmart womens book set
3 set wlmart womens book set
4 microsoft fish sauce
5 fish microsoft fish sauce
6 sauce microsoft fish sauce
7 books books from walmat store
8 from books from walmat store
9 books from walmat store
10 store books from walmat store
11 mens mens login for facebook fools
12 login mens login for facebook fools
13 for mens login for facebook fools
14 mens login for facebook fools
15 fools mens login for facebook fools
16 mens mens login for facbook fools
17 login mens login for facbook fools
18 for mens login for facbook fools
19 mens login for facbook fools
20 fools mens login for facbook fools
21 login login for twetter boy
22 for login for twetter boy
23 twetter login for twetter boy
24 boy login for twetter boy
25 apples from cook
26 from apples from cook
27 cook apples from cook
Process finished with exit code 0
它比原始脚本快 69 倍。我们能做到 100 倍吗?