不久前我做了类似的事情。我对 Pandas 和 Numpy 的配合以及坚持矢量化操作所产生的速度感到惊喜。
下面的示例不需要源文件以外的任何其他文件。根据您的需要修改表格。
from StringIO import StringIO
import pandas as pd
import numpy as np
# Inline sample data so the example needs no file other than this one.
# Note the inconsistent spacing after the commas (e.g. "5234,mobile").
src = """id1, id2, keyword, freq, gp1, gps2
222, 111, #paris, 100, loc1, loc2
444, 234, have, 1000, loc3, loc4
434, 134, #USA, 30, loc5, loc6
234, 234, she, 600, loc1, loc2
523, 5234,mobile, 900, loc3, loc4
"""
# Wrap the text in a file-like object so pandas can read it like a file.
src_handle = StringIO(src)

# Whitespace-separated list of keywords whose rows should be dropped.
blacklist_words = """
have she and did
""".split()

# Split on a comma followed by any amount of whitespace. The separator is a
# regular expression, so it is written as a raw string and the Python parsing
# engine is requested explicitly (the C engine does not support regex
# separators and pandas would otherwise warn while falling back).
table = pd.read_table(src_handle, sep=r",\s*", engine="python")

# A single filter can be built by straight-out comparison: this is a boolean
# Series with one entry per row.
filter_have = table["keyword"] == "have"

# The boolean Series can be used directly as a key to select the matching
# rows. The parenthesized single-argument form prints the same thing whether
# print is a statement (Python 2) or a function (Python 3).
print(table[filter_have])
# We'll solve this by building the filter you need and applying it.
def filter_on_blacklisted_words(keyword, blacklist_words, dataframe):
    """Return the rows of a dataframe whose keyword column is not blacklisted.

    Args:
        keyword: Label of the column in ``dataframe`` to test.
        blacklist_words: Iterable of values to reject; a row is dropped when
            its ``keyword`` entry equals any of them. An empty iterable keeps
            every row.
        dataframe: The pandas DataFrame to filter. It is not modified.

    Returns:
        A new DataFrame holding only the rows that passed the filter, keeping
        their original index labels.
    """
    # Series.isin checks the whole column against the blacklist in one
    # vectorized call and always yields a boolean mask, whatever the column's
    # dtype. list(...) materializes the blacklist so any iterable is accepted.
    is_blacklisted = dataframe[keyword].isin(list(blacklist_words))
    # Keep exactly the rows that are NOT blacklisted.
    return dataframe[~is_blacklisted]
# Show the table with every blacklisted keyword row removed. The
# parenthesized single-argument form is valid on both Python 2 and Python 3.
print(filter_on_blacklisted_words("keyword", blacklist_words, table))