帮我加速这段代码 - Python答案

【问题标题】：Help me speed-up this code - Python帮我加速这段代码 - Python
【发布时间】：2011-08-05 17:53:01
【问题描述】：

伙计们，我正在编写这个程序，它会遍历推文列表并返回最常用的单词。

我想让它更快，但我想知道您是否可以帮助指出一些我可以提高速度的问题或领域。谢谢

见下面的代码

#import string
import re
from string import punctuation
from operator import itemgetter
import pprint


class Tweet:
    def __init__(self, timestamp, userId, message):
        self.timestamp = timestamp
        self.userId = userId
        self.message = message

    def getDate(self):
        tokens = re.split(' ',  self.timestamp)
        return tokens[0]

    def __repr__(self):
        return "[timestamp=%s userId=%s message=%s]" % (self.timestamp, self.userId, self.message)

outfile  = file 
def readOneTweet(file):

    """ Reads a single tweet from the file, and returns the string containing the tweet.
    This will often just be a single line from the file, but may be more if it ends with a slash.
    """
    lineBuffer = "" 
    while True:
        # TODO: read the line and strip it

        rawLine = file.readline().strip('\n')
        if (len(rawLine)== 0):
            break

        lineBuffer +=rawLine

        if (rawLine[(len(rawLine)-1)]!= "\\"):
            break
    return lineBuffer 



def readTweets():
    tweets = []
    inputfile = raw_input("Enter filename: ")

    # move the try / except around a single tweet.
    # so that we can keep going if we encounter a line with an error.
    try:
        f = open(inputfile , "r")

        while True:
            tweet = readOneTweet(f) # readOneTweet is method
            if not tweet:
                break
            try:
                lineStrip = tweet.rstrip()

                split_word = re.split('\t',  lineStrip.lower()) #('/([^a-z])([A-Z]) ([0-9])/n:.;\]+/', line.lower())

                tweetTime = split_word[1]
                userId = split_word[0]
                message = split_word[2]
                tweets.append(Tweet(tweetTime, userId, message))
                if len(tweets) % 10000 == 0:
                    print 'read', len(tweets), 'tweets'
            except IndexError, e:
                print "bad tweet", tweet
    except IOError: 
        print "file not found!"
    return tweets

######################DATA ##############
"""
- Need to separate tweets
- Obtain information about each tweet - UserID, Time, words
"""

def writeWordFile(word):
    toWrite = 'test.txt'
    fileHandle = open ( toWrite, 'w' )
    for i in word:
        fileHandle.write (i)

def dailyMessages(twt):
    dailyMsg =dict ()
    for i in twt:
        date =i.getDate()
        #print  i.message
        #dailyMsg[date] =messageList
        if dailyMsg.has_key(date):
            dailyMsg[date].append(twt)
        else:
            dailyMsg[date] =[twt]
    #for k, v in dailyMsg.items():
        #print k, v, '\n'
    return dailyMsg    

"""
Takes dailyTweets and perform  word coun. 
"""
def dailyWord(tweetsByDay):
    dailyTweetsWordCount = { }
    for date in tweetsByDay.keys():
        dayTweets =tweetsByDay[date]
        if len(dayTweets) != 0:
            count = wordCount(dayTweets)
            dailyTweetsWordCount[date] = count
    return dailyTweetsWordCount


def wordCount(tweets):
    """Takes a list of tweets and returns a dictionary of counts for words"""
    N = 100
    # we'll build a dictionary mapping each word to a SET of users who have used it
    wordTweeters = {}
    for tweet in tweets:
       # print tweet
        for i in tweet:
            for word in i.message.split():
                if not  wordTweeters.has_key(word):
                    wordTweeters[word] = set()
                wordTweeters[word].add(i.userId)

    # we'll build up a dictionary mapping each word to the number of users who have used it.
    p = dict ()
    #print wordTweeters
    for day in wordTweeters.keys():
        usersNo = len (wordTweeters[day])
        p[day] = usersNo
    #print wordTweeters

    return  p  #wordTweeters, p

def searchForMemes(tweetUserCounts):
    for  key in tweetsUserCounts.keys():
       # for pmeme in tweetUserCounts
       pass


    """Takes information returned by daily word"""


def isMeme(word, day1Count, day2Count, day3Count):

    #takes the daily count
    # check if it is a meme
    #First -  check count
        #check count in different days
        # determine the if it qualifies as a tweet
        # if not drop it  do not do below checks 
    #Second - time stamp
        #CHECK ITS TIME TRACK
        #How is the count of over time
        # rise and fall
        # 
    #Third - user id
        # check if is form different users
            #how many of those counts are from different users
       pass 

def dayUserCount(z,word, d1, d2, d3):
    """ assume dictionary will be input"""

    # check if the word exist in the dictionary

    if z.has_key(d1):
        date1 =z[d1]
        #print value.keys()
        if  date1.has_key(word):
            print date1
            c1 =date1[word]
        else:
            print "word not used in %s"%d1
            c1 =0
    else:
        print 'date does not exist'

    if z.has_key(d2):
        #print value.keys()
        date2 =z[d2]
        if  date2.has_key(word):
            print date2
            c2 =date2[word]
        else:
            print "word not used in %s"%d2
            c2 =0
    else:
        print 'date does not exist'

    if z.has_key(d3):
        date3 = z[d3]
        if date3.has_key(word):
            print date3
            c3 =date3[word]
        else:
            print "word not used in %s" %d3
            c3 =0
    else:
        print 'date does not exist'

    result = "Word: %s , %s count: %s, %s count: %s, %s count: %s"%(word,d1,c1,d2,c2, d3,c3)
    return result           



# supportive functions 
def hashtag(tw):
    hasht =[]
    for word in tw.split():
        if word.startswith('#'):
            hasht.append(word)
    return hasht


def httpTag(tw):
    http =[]
    for word in tw.split():
        if word.startswith('http'):
            http.append(word)
    return http

def reply(tw):
    reply =[]
    for word in tw.split():
        if word.startswith('@'):
            reply.append(word)
    return reply

def reTweet(tw):
    rt =[]
    for word in tw.split():
        if word.startswith('rt') or word.startswith('RT'):
            rt.append(word)
    return rt






"""
Old functions 
"""
def writeToFile(tweet):
    #filename = test.txt
    filek = open('test.txt', 'w')
    print "writing on the file: "
    filek.write(tweet)
   # print " _____--------______" + tweet
    filek.close()

# count word frequency.
def f2count():
    N = 100000000000
    words = {}
    words_gen = (word.strip(punctuation).lower()
                 for line in open('c.txt')
                        for word in line.split())
    for word in words_gen:
        words[word] = words.get(word, 0) + 1

    top_words = sorted(words.iteritems(),
                   key=lambda(word, count): (-count, word))[:N] 

    for word, frequency in top_words:
        print "%s: %d" % (word, frequency)

【问题讨论】：

profiler 说了什么？
你刚刚在这里转储了 267 行代码。哪个部分有性能问题？你分析过它吗？
回滚到带有代码的修订版，因为已经发布了引用它的答案。
这个问题最好在 codereview.stackexchange.com 上问

标签： python algorithm dictionary performance readlines

【解决方案1】：

这段代码充满了未优化的sn-ps。

例如，函数的每次调用都需要时间。不要做一些无用的函数调用，你会腾出时间。有些调用可以用理解列表代替：hashtag,httpTag, ...等

我可以帮助优化，但是：

1 - 我目前没有足够的时间进行这种长时间的工作

2 - 由于代码不完整，我们无法优化：以下函数在哪里调用？：

readTweets
writeWordFile
dailyMessages
dailyWord
wordCount
searchForMemes
isMeme
dayUserCount
hashtag
httpTag
reply
reTweet
writeToFile
f2count

3 - 我厌倦了回答新注册的人，这些人在 stackoverflow 上出现了很多问题，然后就消失了，有时甚至没有发表任何新闻或评论。如果您不打算这样做，请原谅

编辑

写入文件

f2count

显然必须从列表中删除。

【讨论】：

查看代码。 F2 count 和 writeToFile 不会在任何地方调用，如果我决定使用它们，这些是可选方法。感谢您的反馈。我在这里很新，希望从反馈和 cmets 中了解更多
@Samsoni Runga 对 f2count 和 writeToFile 没问题，将它们放在列表中是愚蠢的。但是其他的呢？ “查看代码” 看看什么？

【解决方案2】：

wordCount() 可以并行运行。由于每条推文不直接依赖于另一条推文，因此没有理由对列表进行连续迭代。将推文列表分解成更小的列表，然后在每个子列表上创建一个线程。一旦他们都完成了子词典的创建，您可以做一些工作将它们全部合并到一个词典中。

编辑：
如何并行化列表求和的示例。您将更改线程的主体以执行您的任务。

from threading import Thread

numbers = range(1000)

class Sum(Thread):
  def __init__(self, numList):
    Thread.__init__(self)
    self.numList = numList
    self.total = 0

  def run(self):
    for num in self.numList:
      self.total += num

numThreads = 7
threads = []
perThread = len(numbers)/numThreads
for i in xrange(numThreads):
  start = i*perThread
  t = Sum(numbers[start:len(numbers) if i == numThreads-1 else start+perThread])
  t.start()
  threads.append(t)

grandTotal = 0
for t in threads:
  t.join()
  grandTotal += t.total

print grandTotal

【讨论】：

谢谢，我该怎么做？我是python的新手，这个程序可以扫描小于300mb的文件，但如果更多，这个程序需要数年时间。问候

【解决方案3】：

if (len(rawline) == 0):

可以写成

if rawline:

你不应该使用len(rawline) - 1作为索引，只使用rawline[-1]。

我不知道你为什么使用re.split()，而你可以只使用linestip.lower().split('\t')。

不要使用dailyMsg.has_key(date)，使用date in dailyMsg。

当你迭代 tweetsByDay 时，你真的应该这样做：

for date, value in tweetsByDay.items():`

这样您就不必手动将值绑定到键。

这只是一个开始。还有很多问题需要解决。我认为你真的只需要努力掌握 Python ——从阅读你的代码中可以清楚地看出 Python 不是你的第一语言，或者你从一个没有教你如何写好它的资源中学习。例如，为什么要在条件句周围加上括号？这在 Python 中不是必需的（尽管它是来自 C 或 Java 等类似 Algol 的语言的产物）。为什么你使用dict() 而不是{}？最好用第二种方式写一个空的字典。您可能会发现 this tutorial on idiomatic Python 很有帮助。

【讨论】：

非常感谢！！是的，我是新手，我想学习如何编码，这就是我寻求帮助的原因！我感谢 cmets 和反馈
如果出现此错误 if (rawLine[-1]!= "\\"): IndexError: string index out of range
@Samsoni 可能是因为字符串为空。