import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Read the training and test splits from the working directory and keep both
# in `combine` so later cells can apply the same transformation to each.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))
combine = [train_df, test_df]
print(train_df.columns)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Preview the first rows of the raw training data.
train_df.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
# Report column dtypes and non-null counts for the training split and then the
# test split, with a 40-underscore rule printed between the two reports.
train_df.info()
print('_'*40)
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics (count/mean/std/quartiles) for the numeric training columns.
train_df.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
| count |
891.000000 |
891.000000 |
891.000000 |
714.000000 |
891.000000 |
891.000000 |
891.000000 |
| mean |
446.000000 |
0.383838 |
2.308642 |
29.699118 |
0.523008 |
0.381594 |
32.204208 |
| std |
257.353842 |
0.486592 |
0.836071 |
14.526497 |
1.102743 |
0.806057 |
49.693429 |
| min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
223.500000 |
0.000000 |
2.000000 |
20.125000 |
0.000000 |
0.000000 |
7.910400 |
| 50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
| 75% |
668.500000 |
1.000000 |
3.000000 |
38.000000 |
1.000000 |
0.000000 |
31.000000 |
| max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
# Summary (count/unique/top/freq) for the object-dtype training columns.
train_df.describe(include='O')
|
Name |
Sex |
Ticket |
Cabin |
Embarked |
| count |
891 |
891 |
891 |
204 |
889 |
| unique |
891 |
2 |
681 |
147 |
3 |
| top |
Beckwith, Mrs. Richard Leonard (Sallie Monypeny) |
male |
1601 |
G6 |
S |
| freq |
1 |
577 |
7 |
4 |
644 |
# Mean of the 0/1 Survived flag per passenger class, highest rate first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
|
Pclass |
Survived |
| 0 |
1 |
0.629630 |
| 1 |
2 |
0.472826 |
| 2 |
3 |
0.242363 |
# Mean of the 0/1 Survived flag per sex, highest rate first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
|
Sex |
Survived |
| 0 |
female |
0.742038 |
| 1 |
male |
0.188908 |
# Mean of the 0/1 Survived flag per SibSp value, highest rate first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
|
SibSp |
Survived |
| 1 |
1 |
0.535885 |
| 2 |
2 |
0.464286 |
| 0 |
0 |
0.345395 |
| 3 |
3 |
0.250000 |
| 4 |
4 |
0.166667 |
| 5 |
5 |
0.000000 |
| 6 |
8 |
0.000000 |
# Mean of the 0/1 Survived flag per Parch value, highest rate first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
|
Parch |
Survived |
| 3 |
3 |
0.600000 |
| 1 |
1 |
0.550847 |
| 2 |
2 |
0.500000 |
| 0 |
0 |
0.343658 |
| 5 |
5 |
0.200000 |
| 4 |
4 |
0.000000 |
| 6 |
6 |
0.000000 |
# One 20-bin Age histogram per Survived value, laid out side by side.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)
<seaborn.axisgrid.FacetGrid at 0x1a08bea550>

# Age histograms on a Pclass (rows) x Survived (columns) grid.
# NOTE(review): newer seaborn releases renamed FacetGrid's `size` to `height`;
# switch the keyword if this notebook is run on an upgraded seaborn.
grid = sns.FacetGrid(
    data=train_df,
    row='Pclass',
    col='Survived',
    size=2.2,
    aspect=1.6,
)
grid.map(plt.hist, 'Age', bins=20, alpha=.5)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x102855978>

# Survival rate against Pclass, split by Sex, with one facet per embarkation port.
#
# FacetGrid.map calls the plotting function once per facet on that facet's
# subset only, so without explicit `order`/`hue_order` each facet derives its
# own category order and the same colour or x position can mean different
# things in different facets (seaborn warns about exactly this). Pin both.
# Sex still holds the raw 'female'/'male' strings at this point in the notebook.
# NOTE(review): newer seaborn releases renamed FacetGrid's `size` to `height`;
# switch the keyword if this notebook is run on an upgraded seaborn.
grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
grid.map(
    sns.pointplot,
    'Pclass',
    'Survived',
    'Sex',
    palette='deep',
    order=[1, 2, 3],
    hue_order=['female', 'male'],
)
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:708: UserWarning: Using the pointplot function without specifying `hue_order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x1a145b2ba8>

# Mean Fare per Sex on an Embarked (rows) x Survived (columns) grid.
#
# FacetGrid.map draws each facet from its own subset, so without an explicit
# `order` the bar positions follow whichever sex appears first in that subset
# and can disagree between facets that share an x axis (seaborn warns about
# this). Pin the category order so every facet uses the same layout.
# Sex still holds the raw 'female'/'male' strings at this point in the notebook.
# NOTE(review): newer seaborn releases renamed FacetGrid's `size` to `height`;
# switch the keyword if this notebook is run on an upgraded seaborn.
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x1a1457ecf8>

# Record the frame shapes (directly and via `combine`) before dropping columns.
print("Before", *(frame.shape for frame in (train_df, test_df, combine[0], combine[1])))
Before (891, 12) (418, 11) (891, 12) (418, 11)
# Remove the Ticket, Cabin and Name columns from both splits, then rebuild
# `combine` so it points at the new frames rather than the pre-drop ones.
unused_columns = ['Ticket', 'Cabin', 'Name']
train_df = train_df.drop(unused_columns, axis=1)
test_df = test_df.drop(unused_columns, axis=1)
combine = [train_df, test_df]
print("After", *(frame.shape for frame in (train_df, test_df, combine[0], combine[1])))
After (891, 9) (418, 8) (891, 9) (418, 8)
# Encode Sex as an integer in both splits: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded_sex = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded_sex.astype(int)
train_df.head()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
| 0 |
1 |
0 |
3 |
0 |
22.0 |
1 |
0 |
7.2500 |
S |
| 1 |
2 |
1 |
1 |
1 |
38.0 |
1 |
0 |
71.2833 |
C |
| 2 |
3 |
1 |
3 |
1 |
26.0 |
0 |
0 |
7.9250 |
S |
| 3 |
4 |
1 |
1 |
1 |
35.0 |
1 |
0 |
53.1000 |
S |
| 4 |
5 |
0 |
3 |
0 |
35.0 |
0 |
0 |
8.0500 |
S |
# Zero-filled 2x3 scratch table for the age guesses computed in the next cell.
guess_ages = np.zeros(shape=(2, 3))
guess_ages
array([[0., 0., 0.],
[0., 0., 0.]])
# Fill missing Age values, per split, with the median age of passengers that
# share the same Sex code and Pclass, rounded to the nearest 0.5.
# Each guess is also stored in guess_ages[sex, pclass - 1]; after the loop the
# table therefore holds the values computed for the last frame in `combine`.
for dataset in combine:
    age_missing = dataset['Age'].isnull()
    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            # int(x / 0.5 + 0.5) * 0.5 rounds a positive x to the nearest half.
            guess_ages[sex, pclass - 1] = int(median_age / 0.5 + 0.5) * 0.5
            # Groups are disjoint, so filling this one cannot change the
            # median of any group handled later.
            dataset.loc[age_missing & in_group, 'Age'] = guess_ages[sex, pclass - 1]
train_df.head()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
| 0 |
1 |
0 |
3 |
0 |
22.0 |
1 |
0 |
7.2500 |
S |
| 1 |
2 |
1 |
1 |
1 |
38.0 |
1 |
0 |
71.2833 |
C |
| 2 |
3 |
1 |
3 |
1 |
26.0 |
0 |
0 |
7.9250 |
S |
| 3 |
4 |
1 |
1 |
1 |
35.0 |
1 |
0 |
53.1000 |
S |
| 4 |
5 |
0 |
3 |
0 |
35.0 |
0 |
0 |
8.0500 |
S |
# Split Age into five equal-width intervals and show the mean of the 0/1
# Survived flag for each interval, in ascending interval order.
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
(
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)
|
AgeBand |
Survived |
| 0 |
(0.34, 16.336] |
0.550000 |
| 1 |
(16.336, 32.252] |
0.336714 |
| 2 |
(32.252, 48.168] |
0.412844 |
| 3 |
(48.168, 64.084] |
0.434783 |
| 4 |
(64.084, 80.0] |
0.090909 |
# Replace Age in both splits with an ordinal band code:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
# The assignments run in ascending order, so a code written by an earlier line
# (0..3) can never satisfy a later condition and be re-coded.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Fix: the original selected these rows but never assigned to them, which
    # left passengers older than 64 with their raw age instead of a band code.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
train_df.head()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
AgeBand |
| 0 |
1 |
0 |
3 |
0 |
1.0 |
1 |
0 |
7.2500 |
S |
(16.336, 32.252] |
| 1 |
2 |
1 |
1 |
1 |
2.0 |
1 |
0 |
71.2833 |
C |
(32.252, 48.168] |
| 2 |
3 |
1 |
3 |
1 |
1.0 |
0 |
0 |
7.9250 |
S |
(16.336, 32.252] |
| 3 |
4 |
1 |
1 |
1 |
2.0 |
1 |
0 |
53.1000 |
S |
(32.252, 48.168] |
| 4 |
5 |
0 |
3 |
0 |
2.0 |
0 |
0 |
8.0500 |
S |
(32.252, 48.168] |
# AgeBand was only needed to inspect the intervals; drop it and rebuild
# `combine` so it references the new training frame.
train_df = train_df.drop('AgeBand', axis=1)
combine = [train_df, test_df]
train_df.head()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
| 0 |
1 |
0 |
3 |
0 |
1.0 |
1 |
0 |
7.2500 |
S |
| 1 |
2 |
1 |
1 |
1 |
2.0 |
1 |
0 |
71.2833 |
C |
| 2 |
3 |
1 |
3 |
1 |
1.0 |
0 |
0 |
7.9250 |
S |
| 3 |
4 |
1 |
1 |
1 |
2.0 |
1 |
0 |
53.1000 |
S |
| 4 |
5 |
0 |
3 |
0 |
2.0 |
0 |
0 |
8.0500 |
S |
# Fill missing Embarked values in both splits with the most frequent port in
# the training data, then show the mean of the 0/1 Survived flag per port.
freq_port = train_df['Embarked'].dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
|
Embarked |
Survived |
| 0 |
C |
0.553571 |
| 1 |
Q |
0.389610 |
| 2 |
S |
0.339009 |
# Encode Embarked as an integer in both splits: S -> 0, C -> 1, Q -> 2.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    encoded_port = dataset['Embarked'].map(port_codes)
    dataset['Embarked'] = encoded_port.astype(int)
train_df.head()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
| 0 |
1 |
0 |
3 |
0 |
1.0 |
1 |
0 |
7.2500 |
0 |
| 1 |
2 |
1 |
1 |
1 |
2.0 |
1 |
0 |
71.2833 |
1 |
| 2 |
3 |
1 |
3 |
1 |
1.0 |
0 |
0 |
7.9250 |
0 |
| 3 |
4 |
1 |
1 |
1 |
2.0 |
1 |
0 |
53.1000 |
0 |
| 4 |
5 |
0 |
3 |
0 |
2.0 |
0 |
0 |
8.0500 |
0 |
# Fill the missing test-set Fare with the test-set median.
# Fix: the original called fillna(..., inplace=True) on the column selected by
# test_df['Fare']. That is a chained in-place update on an intermediate Series,
# which newer pandas warns about and which does not write back to test_df under
# copy-on-write. Assigning the result works on every pandas version.
# Series.median() skips NaN by default, so no explicit dropna() is needed.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Split the training fares into quartiles and show the mean of the 0/1
# Survived flag for each quartile, in ascending fare order.
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)
|
FareBand |
Survived |
| 0 |
(-0.001, 7.91] |
0.197309 |
| 1 |
(7.91, 14.454] |
0.303571 |
| 2 |
(14.454, 31.0] |
0.454955 |
| 3 |
(31.0, 512.329] |
0.581081 |
# Replace Fare in both splits with an integer band code:
#   <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
for dataset in combine:
    # Snapshot the raw fares so every band test reads the original values.
    raw_fare = dataset['Fare'].copy()
    dataset.loc[raw_fare <= 7.91, 'Fare'] = 0
    dataset.loc[(raw_fare > 7.91) & (raw_fare <= 14.454), 'Fare'] = 1
    dataset.loc[(raw_fare > 14.454) & (raw_fare <= 31), 'Fare'] = 2
    dataset.loc[raw_fare > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# FareBand was only needed to inspect the quartiles; drop it and rebuild
# `combine` so it references the new training frame.
train_df = train_df.drop('FareBand', axis=1)
combine = [train_df, test_df]
train_df.head(10)
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
| 0 |
1 |
0 |
3 |
0 |
1.0 |
1 |
0 |
0 |
0 |
| 1 |
2 |
1 |
1 |
1 |
2.0 |
1 |
0 |
3 |
1 |
| 2 |
3 |
1 |
3 |
1 |
1.0 |
0 |
0 |
1 |
0 |
| 3 |
4 |
1 |
1 |
1 |
2.0 |
1 |
0 |
3 |
0 |
| 4 |
5 |
0 |
3 |
0 |
2.0 |
0 |
0 |
1 |
0 |
| 5 |
6 |
0 |
3 |
0 |
1.0 |
0 |
0 |
1 |
2 |
| 6 |
7 |
0 |
1 |
0 |
3.0 |
0 |
0 |
3 |
0 |
| 7 |
8 |
0 |
3 |
0 |
0.0 |
3 |
1 |
2 |
0 |
| 8 |
9 |
1 |
3 |
1 |
1.0 |
0 |
2 |
1 |
0 |
| 9 |
10 |
1 |
2 |
1 |
0.0 |
1 |
0 |
2 |
1 |