Python Pandas 根据另一列的总数从另一个数据帧中选择值答案

【问题标题】：Python Pandas Select values from another dataFrame depending on the total from another column（Python Pandas 根据另一列的总数从另一个数据帧中选择值）
【发布时间】：2020-03-10 19:22:46
【问题描述】：

我在下面有一个 DataFrame，但我需要根据取消和订单列从每个代码中选择行。

假设代码 xxx 的订单为 [6, 1, 5, 1]，取消数（canceled）是 11。我需要一个算法，选出订单之和恰好等于取消数 11 的那些行（即 [6 和 5]）。

如果没有行能够精确匹配，则选择最接近的 id 并将其加入列表，同时给出它与取消数的差值，如下所示：111111 是选定的 id，35 是 55 与 20 之间的差值。我需要一个可以处理 1 万行数据的算法。

我的预期输出

**code**    **canceled**    **order**              **ids**
     xxx           11.0            13     [128281, 128283]
     cvd             20            55         (111111, 35)

import pandas as pd

# Sample records used by the snippets below: one dict per order line, with the
# keys "code", "canceled", "id" (stored as a string) and "order".
# Note: the id "11111" appears under both code "cvd" and code "eeS".
ccc = [
    {"code":"xxx","canceled":11.0,"id":"128281","order":6},
    {"code":"xxx","canceled":11.0,"id":"128282","order":1},
    {"code":"xxx","canceled":11.0,"id":"128283","order":5},
    {"code":"xxx","canceled":11.0,"id":"128284","order":1},
    {"code":"xxS","canceled":0.0,"id":"108664","order":4},
    {"code":"xxS","canceled":0.0,"id":"110515","order":1},
    {"code":"xxS","canceled":0.0,"id":"113556","order":1},
    {"code":"eeS","canceled":5.0,"id":"115236","order":1},
    {"code":"eeS","canceled":5.0,"id":"108586","order":1},
    {"code":"eeS","canceled":5.0,"id":"114107","order":1},
    {"code":"eeS","canceled":5.0,"id":"113472","order":3},
    {"code":"eeS","canceled":5.0,"id":"114109","order":3},
    {"code":"544W","canceled":44.0,"id":"107650","order":20},
    {"code":"544W","canceled":44.0,"id":"127763","order":4},
    {"code":"544W","canceled":44.0,"id":"128014","order":20},
    {"code":"544W","canceled":44.0,"id":"132434","order":58},
    {"code":"cvd","canceled":20.0,"id":"11111","order":55},
    {"code":"eeS","canceled":5.0,"id":"11111","order":5}
]

我尝试了以下解决方案，它可以运行，但如果存在精确匹配，我需要找到精确值。我还需要选出订单之和等于取消值的最可能的 id 组合。我想消除出现 (111111, 35) 这种结果的可能性。

# Build the working DataFrame from the `ccc` list of record dicts.
df = pd.DataFrame(ccc)

def selected_ids(datum):
    """Greedily pick ids whose orders cover the canceled quantity.

    ``datum`` must expose ``id`` and ``order`` (parallel sequences) and
    ``canceled`` (truncated to an int target). The orders are scanned in the
    given sequence and the first rule that fires decides the result:

    * an order equal to the target      -> that single id
    * a target of zero                  -> ``''``
    * an order larger than the target   -> ``(id, order - target)``
    * running total exceeds the target  -> ``[earlier_ids, (id, overshoot)]``

    ``None`` is returned when the scan ends without any rule firing.
    """
    target = int(datum.canceled)
    identifiers = datum.id
    taken = []
    running_total = 0

    for position, quantity in enumerate(datum.order):
        current_id = identifiers[position]
        taken.append(current_id)
        whole = int(quantity)

        if whole == target:
            return current_id
        if target == 0:
            return ''
        if whole > target:
            return (current_id, whole - target)

        # Only reached when no single-order rule fired, so the total grows by
        # exactly one order per completed iteration.
        running_total = running_total + quantity
        if running_total > target:
            return [taken[:-1], (current_id, running_total - target)]

    return None

# Sort by order size, collapse to one row per code (orders and ids gathered
# into lists), then let selected_ids choose the ids to cancel for each code.
xcv = (
    df.sort_values(by='order')
      .groupby(by='code')
      .agg(dict(code='first', canceled='first', order=list, id=list))
)
xcv['Orders_to_cancel'] = xcv.apply(selected_ids, axis='columns')
xcv

【问题讨论】：

您的问题解决了吗？否则，请考虑改写它。事实上，我认为这很难理解。可能是您应该创建一个玩具示例：使用 very simple 数据框 A，very simple 数据框B，以及预期的输出。

标签： python pandas sorting dataframe filter

【解决方案1】：

下面这种做法怎么样？（为了便于阅读，数据框仅限于代码 xxx、cvd 和 eeS）

# One row per code: keep the first `canceled` value and gather the group's
# orders and ids into lists (exposed as `orders` / `ids`).
per_code_aggregation = {'canceled': 'first', 'order': list, 'id': list}
df2 = df.groupby('code').agg(per_code_aggregation).reset_index()
df2 = df2.rename(columns={'id': 'ids', 'order': 'orders'})
del per_code_aggregation  # scratch name only; keep the namespace unchanged

# Total ordered quantity per code.
df2['orders_sum'] = df2.orders.apply(sum)

print(df2)
###   code  canceled              orders                                              ids  orders_sum
### 0  cvd      20.0                [55]                                          [11111]          55
### 1  eeS       5.0  [1, 1, 1, 3, 3, 5]  [115236, 108586, 114107, 113472, 114109, 11111]          14
### 2  xxx      11.0        [6, 1, 5, 1]                 [128281, 128282, 128283, 128284]          13

然后我们可能首先要检查某些ids 是否已经有一个直接适合canceled 值的order 值。

# ids whose single order already equals the row's canceled value.
df2['direct_ids'] = df2.apply(
    lambda row: [
        ident
        for quantity, ident in zip(row.orders, row.ids)
        if quantity == row.canceled
    ],
    axis=1,
)

print(df2.loc[:, ('code', 'canceled', 'direct_ids')])
###   code  canceled direct_ids
### 0  cvd      20.0         []
### 1  eeS       5.0    [11111]    # We could have had more than one id, hence the list
### 2  xxx      11.0         []

...否则我们必须得到所有可能的ids的组合

import itertools as it
import pprint as pp

# Keep at most the first 40 ids per code before enumerating combinations.
# NOTE(review): 40 ids still allow up to 2**40 - 1 combinations per row, so
# materialising them in a list can exhaust memory long before that cap helps.
df2['ids_'] = df2.ids.apply(lambda id_list: id_list[:40])

# Every non-empty combination of the kept ids, smallest sizes first.
df2['combos'] = df2.ids_.apply(
    lambda id_list: [
        combo
        for size in range(1, len(id_list) + 1)
        for combo in it.combinations(id_list, size)
    ]
)

pp.pprint(df2.combos[2]) # an illustration with the indexed-by-2 combinations (code `'xxx'`)
### [('128281',),
###  ('128282',),
###  ('128283',),
###  ('128284',),
###  ('128281', '128282'),
###  ('128281', '128283'),
###  ('128281', '128284'),
###  ('128282', '128283'),
###  ('128282', '128284'),
###  ('128283', '128284'),
###  ('128281', '128282', '128283'),
###  ('128281', '128282', '128284'),
###  ('128281', '128283', '128284'),
###  ('128282', '128283', '128284'),
###  ('128281', '128282', '128283', '128284')]

我们现在需要计算由这些组合产生的canceled 值和order-sum 之间的所有距离。

# For every candidate combination of ids, measure how far the summed `order`
# of those ids (restricted to the row's code) is from the row's `canceled`.
df2['distances'] = df2.apply(
    lambda r: {
        combo: abs(
            r.canceled
            # Select 'order' as a Series so `.sum()` is already a scalar. The
            # earlier `.sum()[0]` was a positional integer lookup on a
            # label-indexed Series, which pandas has deprecated.
            - df.loc[
                df.code.isin([r.code]) & df.id.isin(combo),
                'order',
            ].sum()
        )
        for combo in r.combos
    },
    axis=1,
)
pp.pprint(df2.distances[2])
### {('128281',): 5.0,
###  ('128281', '128282'): 4.0,
###  ('128281', '128282', '128283'): 1.0,
###  ('128281', '128282', '128283', '128284'): 2.0,
###  ('128281', '128282', '128284'): 3.0,
###  ('128281', '128283'): 0.0,           #<--- this is the 'xxx'-combination we want
###  ('128281', '128283', '128284'): 1.0,
###  ('128281', '128284'): 4.0,
###  ('128282',): 10.0,
###  ('128282', '128283'): 5.0,
###  ('128282', '128283', '128284'): 4.0,
###  ('128282', '128284'): 9.0,
###  ('128283',): 6.0,
###  ('128283', '128284'): 5.0,
###  ('128284',): 10.0}

.. 我们现在可以分离出我们想要的确切组合

# Fallback used when a row has no distances at all.
default_minv = [float('inf')]

# Smallest distance reached by any combination of the row.
df2['min_distance'] = df2.distances.apply(
    lambda by_combo: min(by_combo.values(), default=default_minv[0])
)

# Every combination that reaches that smallest distance (there may be ties).
df2['summed_ids'] = df2.apply(
    lambda row: [
        combo
        for combo, distance in row.distances.items()
        if distance == row.min_distance
    ],
    axis=1,
)

print(df2.loc[:, ('code', 'canceled', 'orders_sum', 'min_distance', 'summed_ids')])
###   code  canceled  orders_sum  min_distance                                         summed_ids
### 0  cvd      20.0          55          35.0                                         [(11111,)]
### 1  eeS       5.0          14           0.0  [(11111,), (115236, 108586, 113472), (115236, ...
### 2  xxx      11.0          13           0.0                                 [(128281, 128283)]

i) 正如您在上面看到的，我把 min_distance 定义为单独的一列，因为在同一列中混放不同类型的对象并不是好习惯；ii) 该方法是通用的，因此 summed_ids 中可以包含多个 ids 组合，即当其中多个组合具有相同的 min_distance 时。

[...] 我还需要选择与取消值相加的最可能的 id。

从此变得如此简单

# Keep only the codes that have an exact answer: either some combination sums
# to `canceled` (distance 0) or at least one single id matches it directly.
cols_of_interest = [
    'code',
    'canceled',
    'orders_sum',
    'direct_ids',
    'summed_ids',
]
sub_df = df2.loc[
    df2.min_distance.eq(0) | df2.direct_ids.map(len),
    cols_of_interest,
]
print(sub_df)
###   code  canceled  orders_sum direct_ids                                         summed_ids
### 1  eeS       5.0          14    [11111]  [(11111,), (115236, 108586, 113472), (115236, ...
### 2  xxx      11.0          13         []                                 [(128281, 128283)]

为了避免存储所有组合（无需像之前那样定义 df2['combos']），您可以这样做：

# Same distance computation, but the combinations are generated lazily from
# `r.ids` instead of being stored in a `combos` column first.
df2['distances'] = df2.apply(
    lambda r: {
        combo: abs(
            r.canceled
            # Select 'order' as a Series so `.sum()` is already a scalar. The
            # earlier `.sum()[0]` was a positional integer lookup on a
            # label-indexed Series, which pandas has deprecated.
            - df.loc[
                df.code.isin([r.code]) & df.id.isin(combo),
                'order',
            ].sum()
        )
        for combo in it.chain.from_iterable(
            it.combinations(r.ids, size)
            for size in range(1, len(r.ids) + 1)
        )
    },
    axis=1,
)

更新

因为我承认它开始变成code-golf，所以考虑下面的（整个）代码

import itertools as it

# One row per code: first `canceled` value, plus the group's orders and ids
# gathered into lists and exposed as `orders` / `ids`.
df2 = (
    df.groupby('code')
      .agg({'canceled': 'first', 'order': list, 'id': list})
      .reset_index()
      .rename(columns={'id': 'ids', 'order': 'orders'})
)

# Total ordered quantity per code.
df2['orders_sum'] = df2.orders.apply(sum)

# ids whose single order already equals the row's canceled value.
df2['direct_ids'] = df2.apply(
    lambda row: [
        ident
        for quantity, ident in zip(row.orders, row.ids)
        if quantity == row.canceled
    ],
    axis=1,
)

def distances_computer(r, keep=10):
    """Score id combinations of one row by distance to its canceled value.

    ``r`` must expose ``canceled`` plus the parallel sequences ``ids`` and
    ``orders``. Combinations are visited from the smallest size upwards and
    the scan stops at the first combination whose orders sum exactly to
    ``r.canceled``.

    The order total of each combination is taken from the row's own
    ``orders`` rather than looked up in the full DataFrame. That avoids a
    whole-frame scan per combination and keeps the total right when the same
    id occurs more than once under a code.

    Args:
        r: row-like object with ``canceled``, ``ids`` and ``orders``.
        keep: how many of the closest scored combinations to consider for
            the result (10 by default; use 1 to keep only the best).

    Returns:
        dict mapping a tuple of ids to its absolute distance, closest first.
        If two scored combinations have identical ids, the smaller distance
        is kept.
    """
    # Pair every id with its own quantity so a combination carries its orders.
    pairs = list(zip(r.ids, r.orders))
    candidates = it.chain.from_iterable(
        it.combinations(pairs, size)
        for size in range(1, len(pairs) + 1)
    )

    scored = []
    for chosen in candidates:
        combo = tuple(ident for ident, _ in chosen)
        d = abs(r.canceled - sum(quantity for _, quantity in chosen))
        scored.append((combo, d))
        if d == 0:  # Exact match found: no need to look any further.
            break

    closest = {}
    for combo, d in sorted(scored, key=lambda item: item[1])[:keep]:
        # setdefault keeps the first (smallest) distance for repeated keys.
        closest.setdefault(combo, d)
    return closest

# Distance of the scored id combinations to `canceled`, row by row.
df2['distances'] = df2.apply(distances_computer, axis=1)

# Fallback used when a row has no distances at all.
default_minv = [float('inf')]

# Smallest distance reached by any scored combination of the row.
df2['min_distance'] = df2.distances.apply(
    lambda by_combo: min(by_combo.values(), default=default_minv[0])
)

# Every combination that reaches that smallest distance (there may be ties).
df2['summed_ids'] = df2.apply(
    lambda row: [
        combo
        for combo, distance in row.distances.items()
        if distance == row.min_distance
    ],
    axis=1,
)

# Keep only the codes with an exact answer: a zero-distance combination or at
# least one single id whose order equals `canceled`.
cols_of_interest = [
    'code',
    'canceled',
    'orders_sum',
    'direct_ids',
    'summed_ids',
]
sub_df = df2.loc[
    df2.min_distance.eq(0) | df2.direct_ids.map(len),
    cols_of_interest,
]

【讨论】：

这在小数据集上效果很好，非常棒。但在大数据集上，它真的需要很长时间。我认为 itertools（组合）必须进行大量循环才能完成我需要的操作。它运行很久，但给出了我想要的结果。有没有办法不用 itertools，在更短的时间内得到相同的结果？
@Newtorn 您可能已经知道这一点，但通常会有长时间运行的进程。您想找到适合给定值的 id 的总和组合。当然，处理这种算法需要时间，其时间复杂度是不可约的。您当前的问题已得到解答，您至少可以将其标记为正确，因为它是正确的。我不能花几个小时在你的问题上。它不是我的代码平台，但我做到了。如果你不得不提出另一个问题，那就是：在 IPython 中处理 pandas 时，减少处理时间的最佳方法是什么？
见Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects。
嗨@KeepAlive 抱歉离开了，但是我正在使用您上次添加的更新脚本。我的行数超过 10k，感谢您为解决这个问题所做的努力。我也会看看你给我的链接。谢谢
我肯定把它放上来了。我会在这里评论链接。感谢您宝贵的时间和建议