这里是部分解决方案;
reorder 是函数式的,但由于我对 pandas 不是很熟悉,所以我只是使用了 Python 的内置数据结构。
编辑:我已经用贪婪的实现替换了partition_by_sum;它试图找到相等的总和,但不注意每个 bin 的项目数。 对更好算法的建议?
这应该会给你一个很好的开端。
from collections import defaultdict
import csv
VALUE_COL = 1
NUM_BINS = 3
inp = [
["foo", 569],
["bar", 9842],
["asdasd", 98],
["poiqweu", 7840],
["oiasd", 4],
["poeri", 145],
["sacodiw", 55],
["aosdwr", 855],
["9523", 60],
["a52sd", 5500],
["sdcw", 415],
["0932", 317]
]
def load_csv(fname, **kwargs):
with open(fname, "rb") as inf:
for row in csv.reader(inf, **kwargs):
yield row
def save_csv(fname, rows, **kwargs):
with open(fname, "wb") as outf:
csv.writer(outf, **kwargs).writerows(rows)
def make_index(lst, col):
"""
Index a table by column;
return list of column-values and dict of lists of rows having that value
"""
values, index = [], defaultdict(list)
for row in lst:
val = row[col]
values.append(val)
index[val].append(row)
return values, index
def min_index(lst):
"""
Return index of min item in lst
"""
return lst.index(min(lst))
def partition_by_sum(values, num_bins, key=None):
"""
Try to partition values into lists having equal sum
Greedy algorithm, per http://en.wikipedia.org/wiki/Partition_problem#Approximation_algorithm_approaches
"""
values.sort(key=key, reverse=True) # sort descending
bins = [[] for i in xrange(num_bins)]
sums = [0] * num_bins
for value in values:
index = min_index(sums)
bins[index].append(value)
sums[index] += value
return bins
def reorder(lst, key=None):
"""
Return [highest, lowest, second-highest, second-lowest, ...]
"""
lst.sort(key=key, reverse=True) # sort in descending order
halflen = (len(lst) + 1) // 2 # find midpoint
highs, lows = lst[:halflen], lst[halflen:][::-1] # grab [high half descending], [low half ascending]
lst[0::2], lst[1::2] = highs, lows # reassemble
return lst
def main():
# load data
data = inp # load_csv("input_file.csv")
# solve partitioning
values, index = make_index(data, VALUE_COL)
bins = partition_by_sum(values, NUM_BINS)
# rearrange for output
bins = [[index[val].pop() for val in reorder(bin)] for bin in bins]
# write output
for i,bin in enumerate(bins, 1):
save_csv("output_file_{}.csv".format(i), bin)
if __name__=="__main__":
main()