【问题标题】:Unable to download the Twitter sentiment corpus by Niek Sanders无法下载 Niek Sanders 的 Twitter 情绪语料库
【发布时间】:2014-11-12 03:30:58
【问题描述】:

我正在关注有关 Twitter 情绪分析的教程。我已经在http://www.sananalytics.com/lab/twitter-sentiment/ 下载了代码。我按照步骤从 cmd 提示符运行 install.py,虽然它确实在“rawdata”文件夹中创建了 json 文件,但当我查看这些 json 文件时,它说:

{
    "errors": [
        {
            "message": "SSL is required",
            "code": 92
        }
    ]
}

install.py代码如下:

#
# Sanders-Twitter Sentiment Corpus Install Script
# Version 0.1
#
# Pulls tweet data from Twitter because ToS prevents distributing it directly.
#
# Right now we use unauthenticated requests, which are rate-limited to 150/hr.
# We use 125/hr to stay safe.  
#
# We could more than double the download speed by using authentication with
# OAuth logins.  But for now, this is too much of a PITA to implement.  Just let
# the script run over a weekend and you'll have all the data.
#
#   - Niek Sanders
#     njs@sananalytics.com
#     October 20, 2011
#
#
# Excuse the ugly code.  I threw this together as quickly as possible and I
# don't normally code in Python.
#
import csv, getpass, json, os, time, urllib


def get_user_params():

    """Prompt the user for the input/output paths, applying defaults.

    Returns a dict with keys 'inList', 'outList' and 'rawDir'.  Hitting
    enter at a prompt (empty string) selects the default shown in the
    prompt text.
    """

    # prompt text, dict key, and default used when the user just hits enter
    prompts = [ ('\nInput file [./corpus.csv]: ',      'inList',  './corpus.csv'),
                ('Results file [./full-corpus.csv]: ', 'outList', './full-corpus.csv'),
                ('Raw data dir [./rawdata/]: ',        'rawDir',  './rawdata/') ]

    user_params = {}
    for prompt, key, default in prompts:
        # raw_input returns '' on a bare enter, so `or` applies the default
        user_params[key] = raw_input( prompt ) or default

    return user_params


def dump_user_params( user_params ):

    """Echo the chosen paths back to the user for confirmation."""
    for label, key in ( ('Input:    ', 'inList'),
                        ('Output:   ', 'outList'),
                        ('Raw data: ', 'rawDir') ):
        print( label + user_params[key] )
    return


def read_total_list( in_filename ):

    """Read the full fetch list from a csv file.

    Each row is [topic, sentiment, tweet_id].  Fix: the file handle is
    now closed via `with` (the original leaked it); text mode matches
    the updated script's usage.
    """
    with open( in_filename, 'r' ) as fp:
        reader = csv.reader( fp, delimiter=',', quotechar='"' )
        return list( reader )


def purge_already_fetched( fetch_list, raw_dir ):

    """Return the subset of fetch_list that still needs downloading.

    A tweet counts as fetched only if its json file exists in raw_dir
    AND parses cleanly; files that fail to parse are re-queued.
    """
    pending = []

    for entry in fetch_list:

        json_path = raw_dir + entry[2] + '.json'
        if not os.path.exists( json_path ):
            pending.append( entry )
            continue

        # file exists -- keep it only if it parses cleanly
        try:
            parse_tweet_json( json_path )
            print( '--> already downloaded #' + entry[2] )
        except RuntimeError:
            pending.append( entry )

    return pending


def get_time_left_str( cur_idx, fetch_list, download_pause ):

    """Format the estimated remaining download time as 'Hh Mm Ss'."""
    remaining = len( fetch_list ) - cur_idx
    total_seconds = remaining * download_pause

    hours, leftover = divmod( int( total_seconds ), 3600 )
    minutes, seconds = divmod( leftover, 60 )

    return '%dh %dm %ds' % (hours, minutes, seconds)


def download_tweets( fetch_list, raw_dir ):

    """Fetch each tweet id in fetch_list as a json file into raw_dir.

    Fix: the endpoint is now requested over https -- a plain http request
    is rejected by Twitter with {"errors":[{"message":"SSL is required",
    "code":92}]}, which is exactly the payload the question reports.
    NOTE(review): the v1 API itself has since been retired by Twitter,
    so requests may still fail until the script is ported to v1.1 with
    OAuth (see the updated version in the answers).
    """
    # ensure raw data directory exists
    if not os.path.exists( raw_dir ):
        os.mkdir( raw_dir )

    # stay within rate limits (150/hr unauthenticated; 125 leaves slack)
    max_tweets_per_hr  = 125
    download_pause_sec = 3600 / max_tweets_per_hr

    # download tweets
    for idx in range(0,len(fetch_list)):

        # current item
        item = fetch_list[idx]

        # print status
        trem = get_time_left_str( idx, fetch_list, download_pause_sec )
        print( '--> downloading tweet #%s (%d of %d) (%s left)' % \
              (item[2], idx+1, len(fetch_list), trem) )

        # pull data over SSL (plain http is rejected with error code 92)
        url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2]
        urllib.urlretrieve( url, raw_dir + item[2] + '.json' )

        # stay in Twitter API rate limits 
        print( '    pausing %d sec to obey Twitter API rate limits' % \
              (download_pause_sec) )
        time.sleep( download_pause_sec )

    return


def parse_tweet_json( filename ):

    """Parse one downloaded tweet json file.

    Returns [created_at, text].  Raises RuntimeError if the json is
    malformed or contains a Twitter API error payload.

    Fixes: (1) the original only checked for an 'error' key, but the
    API also returns an 'errors' list (as in the question's
    "SSL is required" files), so broken downloads were treated as valid
    and never re-fetched; (2) the file handle is now closed via `with`.
    """
    # read tweet
    print( 'opening: ' + filename )
    with open( filename, 'rb' ) as fp:

        # parse json
        try:
            tweet_json = json.load( fp )
        except ValueError:
            raise RuntimeError('error parsing json')

    # look for twitter api error msgs ('error' in v1, 'errors' in v1.1)
    if 'error' in tweet_json or 'errors' in tweet_json:
        raise RuntimeError('error in downloaded tweet')

    # extract creation date and tweet text
    return [ tweet_json['created_at'], tweet_json['text'] ]


def build_output_corpus( out_filename, raw_dir, total_list ):

    """Write the final corpus csv from the downloaded raw json files.

    total_list rows are [topic, sentiment, tweet_id]; each output row
    appends the tweet's creation date and text.  Missing or unparseable
    tweets are counted and reported.  Fix: the output handle is now
    closed deterministically via try/finally (the original never closed
    it, risking lost buffered rows).
    """
    # open csv output file ('wb' is the correct csv mode on Python 2)
    fp = open( out_filename, 'wb' )
    try:
        writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\',
                             quoting=csv.QUOTE_ALL )

        # write header row
        writer.writerow( ['Topic','Sentiment','TweetId','TweetDate','TweetText'] )

        # parse all downloaded tweets
        missing_count = 0
        for item in total_list:

            # ensure tweet exists
            if os.path.exists( raw_dir + item[2] + '.json' ):

                try:
                    # parse tweet
                    parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' )
                    full_row = item + parsed_tweet

                    # character encoding for output (utf-8 bytes for py2 csv)
                    for i in range(0,len(full_row)):
                        full_row[i] = full_row[i].encode("utf-8")

                    # write csv row
                    writer.writerow( full_row )

                except RuntimeError:
                    print( '--> bad data in tweet #' + item[2] )
                    missing_count += 1

            else:
                print( '--> missing tweet #' + item[2] )
                missing_count += 1
    finally:
        fp.close()

    # indicate success
    if missing_count == 0:
        print( '\nSuccessfully downloaded corpus!' )
        print( 'Output in: ' + out_filename + '\n' )
    else:
        print( '\nMissing %d of %d tweets!' % (missing_count, len(total_list)) )
        print( 'Partial output in: ' + out_filename + '\n' )

    return


def main():

    """Drive the corpus download: params -> fetch -> retry -> build csv."""

    # get user parameters
    params = get_user_params()
    dump_user_params( params )

    # build the list of tweets that still need fetching
    all_rows = read_total_list( params['inList'] )
    pending = purge_already_fetched( all_rows, params['rawDir'] )

    # first download pass
    download_tweets( pending, params['rawDir'] )

    # second pass catches transient failures from the first
    print( '\nStarting second pass to retry any failed downloads' )
    pending = purge_already_fetched( all_rows, params['rawDir'] )
    download_tweets( pending, params['rawDir'] )

    # write the combined csv
    build_output_corpus( params['outList'], params['rawDir'], all_rows )

    return


# script entry point
if __name__ == '__main__':
    main()

【问题讨论】:

    标签: python-2.7 twitter sentiment-analysis


    【解决方案1】:

    对于任何其他谨慎的旅行者......

    我注意到 KubiK888 没有链接他找到更新代码的位置。

    A) 这是我在 github 上找到的 CSV 的完整上传 - https://raw.githubusercontent.com/zfz/twitter_corpus/master/full-corpus.csv

    似乎有全部 6000 多条推文,在删除“不相关”推文后,它有 3000 多条观察结果。

    B) 或者，这里是一个包含完整代码的存储库，其中有人更新了 Niek Sanders 的原始 0.1 版本以支持 twitter API 1.1（包括 OAuth）

    https://github.com/aweiand/TwitterSentiment/blob/71c007948b8fb854b1df0b2a3a32d2629653e74b/GetTwitterCorpus/getTweets.py

    它还有各种格式的完整语料库: https://github.com/aweiand/TwitterSentiment/tree/71c007948b8fb854b1df0b2a3a32d2629653e74b/GetTwitterCorpus

    【讨论】:

      【解决方案2】:

      显然,我发布的仍然在他们网站上的代码已经过时了。我从 Github 找到了更新的版本如下:

      # This code is supporting material for the book
      # Building Machine Learning Systems with Python
      # by Willi Richert and Luis Pedro Coelho
      # published by PACKT Publishing
      #
      # It is made available under the MIT License
      
      #
      # Sanders-Twitter Sentiment Corpus Install Script
      # Version 0.1
      #
      # Pulls tweet data from Twitter because ToS prevents distributing it directly.
      #
      #   - Niek Sanders
      #     njs@sananalytics.com
      #     October 20, 2011
      #
      #
      
      # In Sanders' original form, the code was using Twitter API 1.0.
      # Now that Twitter moved to 1.1, we had to make a few changes.
      # Cf. twitterauth.py for the details.
      
      # Regarding rate limiting, please check
      # https://dev.twitter.com/rest/public/rate-limiting
      
      import sys
      import csv
      import json
      import os
      import time
      
      # Fail early with install instructions if the `twitter` package
      # (https://github.com/sixohsix/twitter) is not importable.
      try:
          import twitter
      except ImportError:
          print("""\
      You need to ...
          pip install twitter
      If pip is not found you might have to install it using easy_install.
      If it does not work on your system, you might want to follow instructions
      at https://github.com/sixohsix/twitter, most likely:
        $ git clone https://github.com/sixohsix/twitter
        $ cd twitter
        $ sudo python setup.py install
      """)

          sys.exit(1)

      # OAuth credentials -- replace the placeholders with your own app's
      # keys/tokens from the Twitter developer portal before running.
      ckey = "xxxxx"
      csecret = "xxxxx"
      atoken = "xxxxx"
      asecret = "xxxxx"

      # authenticated API 1.1 client shared by the download functions below
      api = twitter.Twitter(auth=twitter.OAuth(consumer_key=ckey, consumer_secret=csecret,
                                               token=atoken, token_secret=asecret))

      DATA_PATH = "data"

      # for some reasons TWeets disappear. In this file we collect those
      MISSING_ID_FILE = os.path.join(DATA_PATH, "missing.tsv")
      NOT_AUTHORIZED_ID_FILE = os.path.join(DATA_PATH, "not_authorized.tsv")
      
      
      def get_user_params(DATA_PATH):
          """Build the input/output paths under DATA_PATH.

          Returns a dict with keys 'inList', 'outList' and 'rawDir'.
          Fix: the old "empty string -> default" branches (kept over from
          the interactive original) could never trigger, because
          os.path.join always yields a non-empty string; they have been
          removed as dead code.
          """
          user_params = {}

          # derived file locations
          user_params['inList'] = os.path.join(DATA_PATH, 'corpus.csv')
          user_params['outList'] = os.path.join(DATA_PATH, 'full-corpus.csv')
          user_params['rawDir'] = os.path.join(DATA_PATH, 'rawdata/')

          return user_params
      
      
      def dump_user_params(user_params):
          """Echo the chosen paths back to the user for confirmation."""
          for label, key in (('Input:    ', 'inList'),
                             ('Output:   ', 'outList'),
                             ('Raw data: ', 'rawDir')):
              print(label + user_params[key])
      
      
      def read_total_list(in_filename):
          """Read the fetch-list csv, skipping ids known to be gone.

          Ids recorded in MISSING_ID_FILE / NOT_AUTHORIZED_ID_FILE by a
          previous run are filtered out.  Each kept row is
          [topic, sentiment, tweet_id].  Fix: all three file handles are
          now closed via `with` (the original leaked the csv handle and
          both id-list handles).
          """
          def _read_ids(path):
              # one tweet id per line; an absent file means nothing to skip
              if not os.path.exists(path):
                  return []
              with open(path, "r") as f:
                  return [line.strip() for line in f]

          missing_ids = _read_ids(MISSING_ID_FILE)
          not_authed_ids = _read_ids(NOT_AUTHORIZED_ID_FILE)

          print("We will skip %i tweets that are not available or visible any more on twitter" % (
              len(missing_ids) + len(not_authed_ids)))

          ignore_ids = set(missing_ids + not_authed_ids)

          # read total fetch list csv, dropping ignored ids
          with open(in_filename, 'rt') as fp:
              reader = csv.reader(fp, delimiter=',', quotechar='"')
              total_list = [row for row in reader if row[2] not in ignore_ids]

          return total_list
      
      
      def purge_already_fetched(fetch_list, raw_dir):
          """Return the subset of fetch_list not yet downloaded and parseable.

          A tweet counts as fetched only if its json file exists in
          raw_dir AND parses cleanly; files that fail to parse are
          re-queued for download.
          """
          pending = []
          done = 0

          for entry in fetch_list:

              json_path = os.path.join(raw_dir, entry[2] + '.json')
              if not os.path.exists(json_path):
                  pending.append(entry)
                  continue

              # file exists -- keep it only if it parses cleanly
              try:
                  parse_tweet_json(json_path)
                  done += 1
              except RuntimeError:
                  print("Error parsing", entry)
                  pending.append(entry)

          print("We have already downloaded %i tweets." % done)

          return pending
      
      
      def download_tweets(fetch_list, raw_dir):
          # Download every id in fetch_list into raw_dir as one json file per
          # tweet, via the module-level authenticated `api` client.  Ids that
          # Twitter reports as gone are appended to MISSING_ID_FILE /
          # NOT_AUTHORIZED_ID_FILE so later runs skip them.

          # ensure raw data directory exists
          if not os.path.exists(raw_dir):
              os.mkdir(raw_dir)

          # download tweets
          for idx in range(0, len(fetch_list)):
              # current item
              item = fetch_list[idx]
              print(item)

              print('--> downloading tweet #%s (%d of %d)' %
                    (item[2], idx + 1, len(fetch_list)))

              try:
                  #import pdb;pdb.set_trace()
                  response = api.statuses.show(_id=item[2])

                  # NOTE(review): rate_limit_remaining / rate_limit_reset are
                  # attributes of the sixohsix `twitter` response object --
                  # confirm they exist for the installed client version.
                  if response.rate_limit_remaining <= 0:
                      wait_seconds = response.rate_limit_reset - time.time()
                      print("Rate limiting requests us to wait %f seconds" %
                            wait_seconds)
                      # +5s slack so we don't wake just before the reset time
                      time.sleep(wait_seconds+5)

              except twitter.TwitterError as e:
                  fatal = True
                  print(e)
                  for m in json.loads(e.response_data.decode())['errors']:
                      if m['code'] == 34:
                          # 34: tweet does not exist -- record it and move on
                          print("Tweet missing: ", item)
                          with open(MISSING_ID_FILE, "at") as f:
                              f.write(item[2] + "\n")

                          fatal = False
                          break
                      elif m['code'] == 63:
                          # 63: author suspended -- treated the same as missing
                          print("User of tweet '%s' has been suspended." % item)
                          with open(MISSING_ID_FILE, "at") as f:
                              f.write(item[2] + "\n")

                          fatal = False
                          break
                      elif m['code'] == 88:
                          # 88: rate limit exceeded -- abort the whole run
                          print("Rate limit exceeded.")
                          fatal = True
                          break
                      elif m['code'] == 179:
                          # 179: protected tweet we are not authorized to read
                          print("Not authorized to view this tweet.")
                          with open(NOT_AUTHORIZED_ID_FILE, "at") as f:
                              f.write(item[2] + "\n")
                          fatal = False
                          break

                  # any unrecognized error code leaves fatal=True and re-raises
                  if fatal:
                      raise
                  else:
                      continue

              with open(raw_dir + item[2] + '.json', "wt") as f:
                  f.write(json.dumps(dict(response)) + "\n")

          return
      
      
      def parse_tweet_json(filename):
          """Parse one downloaded tweet json file.

          Returns [created_at, text].  Raises RuntimeError if the json is
          malformed or contains a Twitter API error payload ('error' in
          API v1, 'errors' in v1.1).  Fix: the file handle is now closed
          via `with` (the original leaked it).
          """
          # read and parse the tweet
          with open(filename, 'r') as fp:
              try:
                  tweet_json = json.load(fp)
              except ValueError as e:
                  print(e)
                  raise RuntimeError('error parsing json')

          # look for twitter api error msgs
          if 'error' in tweet_json or 'errors' in tweet_json:
              raise RuntimeError('error in downloaded tweet')

          # extract creation date and tweet text
          return [tweet_json['created_at'], tweet_json['text']]
      
      
      def build_output_corpus(out_filename, raw_dir, total_list):
          # Assemble the final csv corpus from the raw per-tweet json files.
          # Each output row is the input row plus [created_at, text].
          # NOTE(review): 'wb' + .encode("utf-8") is the Python 2 csv idiom;
          # under Python 3 csv.writer needs a text-mode file and str cells,
          # so this would need open(out_filename, 'w', newline='') and no
          # encode loop -- confirm which interpreter this is run under.

          # open csv output file
          fp = open(out_filename, 'wb')
          writer = csv.writer(fp, delimiter=',', quotechar='"', escapechar='\\',
                              quoting=csv.QUOTE_ALL)

          # write header row
          writer.writerow(
              ['Topic', 'Sentiment', 'TweetId', 'TweetDate', 'TweetText'])

          # parse all downloaded tweets
          missing_count = 0
          for item in total_list:

              # ensure tweet exists
              if os.path.exists(raw_dir + item[2] + '.json'):

                  try:
                      # parse tweet
                      parsed_tweet = parse_tweet_json(raw_dir + item[2] + '.json')
                      full_row = item + parsed_tweet

                      # character encoding for output
                      for i in range(0, len(full_row)):
                          full_row[i] = full_row[i].encode("utf-8")

                      # write csv row
                      writer.writerow(full_row)

                  except RuntimeError:
                      print('--> bad data in tweet #' + item[2])
                      missing_count += 1

              else:
                  print('--> missing tweet #' + item[2])
                  missing_count += 1

          # indicate success
          if missing_count == 0:
              print('\nSuccessfully downloaded corpus!')
              print('Output in: ' + out_filename + '\n')
          else:
              print('\nMissing %d of %d tweets!' % (missing_count, len(total_list)))
              print('Partial output in: ' + out_filename + '\n')

          return
      
      
      def main():
          """Drive the corpus download: params -> fetch -> retry -> build csv."""
          # get user parameters
          params = get_user_params(DATA_PATH)
          print(params)
          dump_user_params(params)

          # full fetch list, minus ids known to be gone
          all_rows = read_total_list(params['inList'])

          # drop anything already on disk and parseable
          pending = purge_already_fetched(all_rows, params['rawDir'])
          print("Fetching %i tweets..." % len(pending))

          if pending:
              # first download pass
              download_tweets(pending, params['rawDir'])

              # retry whatever the first pass failed to land
              pending = purge_already_fetched(all_rows, params['rawDir'])
              if pending:
                  print('\nStarting second pass to retry %i failed downloads...' %
                        len(pending))
                  download_tweets(pending, params['rawDir'])
          else:
              print("Nothing to fetch any more.")

          # write the combined csv
          build_output_corpus(params['outList'], params['rawDir'],
                              all_rows)
      
      
      # script entry point
      if __name__ == '__main__':
          main()
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2022-11-20
        • 2015-06-30
        • 2017-04-26
        • 2018-04-10
        • 1970-01-01
        • 1970-01-01
        • 2016-03-15
        相关资源
        最近更新 更多