【发布时间】:2026-02-09 18:25:01
【问题描述】:
################################################################
#
# Reddit sentiment script for comments in /r/wallstreetbets
# Loads comments from db, scans comments using word list,
# creates sentiment score and writes results to database
#
################################################################
import sqlite3
import re
from datetime import datetime
# Load key search words from text file db
def read_words_from_file(ftext_file):
load_words = ()
f = open(ftext_file, 'r')
# for item in f:
# load_words.append(item.lower().strip())
load_words = tuple([x.lower().strip() for x in f])
return(load_words)
# Search key words against comments
# Gives +/- for positive or negative sentiment words
def word_search(db_list, fpos_flist, fneg_flist):
pos_count = 0
neg_count = 0
fdb_results_list = []
total_lines_scanned = 0
print("Starting word search...")
# 1st for loop is comment data
# 2nd for loop is key words
for comment in db_list:
total_lines_scanned = total_lines_scanned + 1
for pos_item in fpos_flist:
word_search = re.findall(r"\b" + pos_item + r"\b", comment[0])
pos_count = pos_count + len(word_search)
for neg_item in fneg_flist:
word_search = re.findall(r"\b" + neg_item + r"\b", comment[0])
neg_count = neg_count + len(word_search)
# Determine pos/neg sentiment score based on frequency in comment
if pos_count > neg_count:
pos_count = pos_count / (pos_count+neg_count)
neg_count = 0
elif pos_count < neg_count:
neg_count = neg_count / (pos_count+neg_count)
pos_count = 0
elif pos_count == neg_count:
pos_count = 0
neg_count = 0
if pos_count > 0 or neg_count > 0:
fdb_results_list.append([pos_count, neg_count, comment[1]])
if total_lines_scanned % 100000 == 0:
print("Lines counted so far:", total_lines_scanned)
pos_count = 0
neg_count = 0
print("Word search complete.")
return(fdb_results_list)
# Write results to new DB. Deletes odd db.
# pos = item[0], neg = item[1], timestamp = item[2]
def write_db_results(write_db_list):
print("Writing results to database...")
conn = sqlite3.connect('testdb.sqlite', timeout=30)
cur = conn.cursor()
cur.executescript('''DROP TABLE IF EXISTS redditresultstable
''')
cur.executescript('''
CREATE TABLE redditresultstable (
id INTEGER NOT NULL PRIMARY KEY UNIQUE,
pos_count INTEGER,
neg_count INTEGER,
timestamp TEXT
);
''')
for item in write_db_list:
cur.execute('''INSERT INTO redditresultstable (pos_count, neg_count, timestamp)
VALUES (?, ?, ?)''', (item[0], item[1], item[2]))
conn.commit()
conn.close()
print("Writing results to database complete.")
# Load comments item[2] and timestamp item[4] from db
def load_db_comments():
print("Loading database...")
conn = sqlite3.connect('redditusertrack.sqlite')
cur = conn.cursor()
cur.execute('SELECT * FROM redditcomments')
row_db = cur.fetchall()
conn.close()
print("Loading complete.")
db_list = ()
db_list = tuple([(item[2].lower(), item[4]) for item in row_db])
return db_list
# Main Program Starts Here
print(datetime.now())
db_list = load_db_comments()
pos_word_list = read_words_from_file("simple_positive_words.txt")
neg_word_list = read_words_from_file("simple_negative_words.txt")
db_results_list = word_search(db_list, pos_word_list, neg_word_list)
db_results_list = tuple(db_results_list)
write_db_results(db_results_list)
print(datetime.now())
此脚本从 SQLite 将 130 万个 cmets 加载到内存中,然后针对每条评论扫描 147 个关键字,然后计算情绪得分。约 1.91 亿次迭代。
执行需要 5 分 32 秒
我将大部分变量更改为元组(来自列表)并使用列表推导而不是 For 循环(用于追加)。与仅使用列表和 For 循环追加的脚本相比,这将执行提高了约 5%。 5% 可能是一个误差范围,因为我的测量方法可能不准确。
* 和其他资源似乎表明,在这种类型的迭代中使用元组更快,尽管一些海报提供的证据表明在某些情况下列表更快。
这段代码是否针对使用元组和列表理解进行了正确优化?
编辑:感谢大家的建议/cmets。很多工作要做。我实施了 @YuriyP 的建议,运行时间从 5 多分钟缩短到 26 秒。问题在于正则表达式 For 循环搜索功能。
所附图片中的更新代码。我删除了红色划掉的代码并用绿色更新了它。
【问题讨论】:
-
寻求对工作代码进行一般审查和批评的问题应发送至CodeReview.SE。
-
使用元组代替列表,使用列表推导代替 for 循环不会让你的代码更快
-
您是否分析过您的解决方案以查看瓶颈在哪里? - docs.python.org/3/library/profile.html
-
您为每个正面/负面词制作一个正则表达式模式,并为每个评论执行此操作。那些模式永远不会改变,他们应该做一次;存储和使用。您甚至可以创建一个可以找到任何这些单词的模式。 https://*.com/questions/24178290/python-regex-check-if-string-contains-any-of-words
-
如果 cmets 的单词少于 147 个,看看从评论中制作模式并搜索关键字字符串是否更快。
标签: python regex performance