import pandas as pd
from datetime import date, timedelta
today = pd.Timestamp.today()
# note that the 8 and 7 for china and america are swapped for testing
df = pd.DataFrame([['china',today,1,4,8],
['america',today,2,5,7],
['china',date.today() - timedelta(days=1),3,6,9],
['india',date.today() - timedelta(days=2),4,7,10]],
columns=['country','date', 'a','b','c'])
# find the daily max: 1 line of fast code compared to 7 lines of a for-loop
daily_max = df.groupby('date', as_index=False)[['a', 'b', 'c']].max()
# add column with daily_max
daily_max['country'] = 'daily max'
# combine with df
df_updated = pd.concat([df, daily_max]).sort_values(['date', 'country']).reset_index(drop=True)
# display(df_updated)
country date a b c
0 daily max 2020-09-06 00:00:00.000000 4 7 10
1 india 2020-09-06 00:00:00.000000 4 7 10
2 china 2020-09-07 00:00:00.000000 3 6 9
3 daily max 2020-09-07 00:00:00.000000 3 6 9
4 america 2020-09-08 14:38:20.382794 2 5 7
5 china 2020-09-08 14:38:20.382794 1 4 8
6 daily max 2020-09-08 14:38:20.382794 2 5 8
- 另一种方法是添加一列布尔值来选择每日最大值。
- 这样就可以用同一个滑块来选择 a、b 或 c 的最大值。
- 同样使用 groupby,并配合 .transform,使结果与原数据框保持相同的索引(轴)。
- 如果某个指标在某一天全部为 0(即当天没有任何计数),那么该指标当天的整列都会是 True,因为 0 就是当天的最大值。
import pandas as pd
from datetime import date, timedelta
today = pd.Timestamp.today()
# note that the 8 and 7 for china and america are swapped for testing
df = pd.DataFrame([['china',today,1,4,8],
['america',today,2,5,7],
['china',date.today() - timedelta(days=1),3,6,9],
['india',date.today() - timedelta(days=2),4,7,10]],
columns=['country','date', 'a','b','c'])
# add columns using groupby and transform
df[['max_a', 'max_b', 'max_c']] = df.groupby('date')[['a', 'b', 'c']].transform('max') == df[['a', 'b', 'c']]
# display(df)
country date a b c max_a max_b max_c
0 china 2020-09-08 13:14:25.713340 1 4 8 False False True
1 america 2020-09-08 13:14:25.713340 2 5 7 True True False
2 china 2020-09-07 00:00:00.000000 3 6 9 True True True
3 india 2020-09-06 00:00:00.000000 4 7 10 True True True
真实 COVID 数据示例
import pandas as pd
# Read only the first six columns of the CSV and parse 'date' as datetimes.
source_url = 'https://raw.githubusercontent.com/trenton3983/stack_overflow/master/data/so_data/2020-09-08%2063800602/covid_data.csv'
df = pd.read_csv(source_url, usecols=range(6), parse_dates=['date'])

# Drop the 'World' rows: per the original author they hold the daily total
# across all locations and would therefore always be the maximum.
is_world = df['location'] == 'World'
df = df.loc[~is_world]

# The four trailing columns are the metrics; slicing avoids typing their names.
cols = df.columns[-4:]

# One vectorised groupby yields the per-date maximum of every metric column.
daily_max = df.groupby('date', as_index=False)[cols].max()
# Label the summary rows so they can sit alongside the real locations.
daily_max = daily_max.assign(location='daily max')

# Stack the summary rows under the originals, then order by date and location.
combined = pd.concat([df, daily_max])
df_updated = combined.sort_values(['date', 'location']).reset_index(drop=True)
显示 2020-07-04 当天数据的最后 15 行
# Keep only the rows dated 2020-07-04 and show the last 15 of them.
df_updated.loc[df_updated['date'] == '2020-07-04'].tail(15)
date location new_cases new_deaths total_cases total_deaths
28124 2020-07-04 Ukraine 876.0 27.0 46763.0 1212.0
28125 2020-07-04 United Arab Emirates 672.0 1.0 50141.0 318.0
28126 2020-07-04 United Kingdom 602.0 49.0 286141.0 40581.0
28127 2020-07-04 United States 54442.0 694.0 2794321.0 129434.0
28128 2020-07-04 United States Virgin Islands 13.0 0.0 111.0 6.0
28129 2020-07-04 Uruguay 5.0 0.0 952.0 28.0
28130 2020-07-04 Uzbekistan 301.0 2.0 9500.0 29.0
28131 2020-07-04 Vatican 0.0 0.0 12.0 0.0
28132 2020-07-04 Venezuela 264.0 2.0 6537.0 59.0
28133 2020-07-04 Vietnam 0.0 0.0 355.0 0.0
28134 2020-07-04 Western Sahara 58.0 0.0 519.0 1.0
28135 2020-07-04 Yemen 19.0 10.0 1240.0 335.0
28136 2020-07-04 Zambia 0.0 0.0 1632.0 30.0
28137 2020-07-04 Zimbabwe 8.0 0.0 625.0 7.0
28138 2020-07-04 daily max 54442.0 1290.0 2794321.0 129434.0
两种方法的示例输出
date location new_cases new_deaths total_cases total_deaths max new_cases max new_deaths max total_cases max total_deaths
28124 2020-07-04 Ukraine 876.0 27.0 46763.0 1212.0 False False False False
28125 2020-07-04 United Arab Emirates 672.0 1.0 50141.0 318.0 False False False False
28126 2020-07-04 United Kingdom 602.0 49.0 286141.0 40581.0 False False False False
28127 2020-07-04 United States 54442.0 694.0 2794321.0 129434.0 True False True True
28128 2020-07-04 United States Virgin Islands 13.0 0.0 111.0 6.0 False False False False
28129 2020-07-04 Uruguay 5.0 0.0 952.0 28.0 False False False False
28130 2020-07-04 Uzbekistan 301.0 2.0 9500.0 29.0 False False False False
28131 2020-07-04 Vatican 0.0 0.0 12.0 0.0 False False False False
28132 2020-07-04 Venezuela 264.0 2.0 6537.0 59.0 False False False False
28133 2020-07-04 Vietnam 0.0 0.0 355.0 0.0 False False False False
28134 2020-07-04 Western Sahara 58.0 0.0 519.0 1.0 False False False False
28135 2020-07-04 Yemen 19.0 10.0 1240.0 335.0 False False False False
28136 2020-07-04 Zambia 0.0 0.0 1632.0 30.0 False False False False
28137 2020-07-04 Zimbabwe 8.0 0.0 625.0 7.0 False False False False
28138 2020-07-04 daily max 54442.0 1290.0 2794321.0 129434.0 NaN NaN NaN NaN