数据预处理
import pandas as pd
titanic = pd.read_csv('./data/titanic_train.csv')
titanic.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
titanic.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
| count |
891.000000 |
891.000000 |
891.000000 |
714.000000 |
891.000000 |
891.000000 |
891.000000 |
| mean |
446.000000 |
0.383838 |
2.308642 |
29.699118 |
0.523008 |
0.381594 |
32.204208 |
| std |
257.353842 |
0.486592 |
0.836071 |
14.526497 |
1.102743 |
0.806057 |
49.693429 |
| min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
223.500000 |
0.000000 |
2.000000 |
20.125000 |
0.000000 |
0.000000 |
7.910400 |
| 50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
| 75% |
668.500000 |
1.000000 |
3.000000 |
38.000000 |
1.000000 |
0.000000 |
31.000000 |
| max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
缺失项填充和字符数据映射成数字数据
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
| count |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
| mean |
446.000000 |
0.383838 |
2.308642 |
29.361582 |
0.523008 |
0.381594 |
32.204208 |
| std |
257.353842 |
0.486592 |
0.836071 |
13.019697 |
1.102743 |
0.806057 |
49.693429 |
| min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
223.500000 |
0.000000 |
2.000000 |
22.000000 |
0.000000 |
0.000000 |
7.910400 |
| 50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
| 75% |
668.500000 |
1.000000 |
3.000000 |
35.000000 |
1.000000 |
0.000000 |
31.000000 |
| max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
print(titanic['Sex'].unique())
['male' 'female']
titanic.loc[titanic['Sex']=='male', 'Sex'] = 0
titanic.loc[titanic['Sex']=='female', 'Sex'] = 1
titanic.describe()
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
| count |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
| mean |
446.000000 |
0.383838 |
2.308642 |
0.352413 |
29.361582 |
0.523008 |
0.381594 |
32.204208 |
| std |
257.353842 |
0.486592 |
0.836071 |
0.477990 |
13.019697 |
1.102743 |
0.806057 |
49.693429 |
| min |
1.000000 |
0.000000 |
1.000000 |
0.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
223.500000 |
0.000000 |
2.000000 |
0.000000 |
22.000000 |
0.000000 |
0.000000 |
7.910400 |
| 50% |
446.000000 |
0.000000 |
3.000000 |
0.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
| 75% |
668.500000 |
1.000000 |
3.000000 |
1.000000 |
35.000000 |
1.000000 |
0.000000 |
31.000000 |
| max |
891.000000 |
1.000000 |
3.000000 |
1.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked']=='S', 'Embarked'] = 0
titanic.loc[titanic['Embarked']=='C', 'Embarked'] = 1
titanic.loc[titanic['Embarked']=='Q', 'Embarked'] = 2
print(titanic['Embarked'].unique())
['S' 'C' 'Q' nan]
[0 1 2]
使用线性回归算法实现
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
fold = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
predictions = []
for train, test in fold:
train_predictors = (titanic[predictors].iloc[train,:])
train_target = titanic['Survived'].iloc[train]
alg.fit(train_predictors, train_target)
test_predictions = alg.predict(titanic[predictors].iloc[test,:])
predictions.append(test_predictions)
import numpy as np
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = len(predictions[predictions == titanic['Survived']]) / len(predictions)
print(len(predictions[predictions == titanic['Survived']]))
print(len(predictions))
print(accuracy)
698
891
0.7833894500561167
逻辑回归算法实现
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression()
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())
0.7878787878787877
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
使用随机森林算法
titanic_test = pd.read_csv('./data/test.csv')
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7856341189674523
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.8159371492704826
再次进行特征处理
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))
import re
def get_title(name):
title_search = re.search(' ([A-Za-z]+)\.', name)
if title_search:
return title_search.group(1)
return ''
titles = titanic['Name'].apply(get_title)
print(pd.value_counts(titles))
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
titles[titles == k] = v
print(pd.value_counts(titles))
titanic['Title'] = titles
Mr 517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6
Col 2
Major 2
Mlle 2
Lady 1
Mme 1
Countess 1
Jonkheer 1
Sir 1
Don 1
Ms 1
Capt 1
Name: Name, dtype: int64
1 517
2 183
3 125
4 40
5 7
6 6
7 5
10 3
8 3
9 2
Name: Name, dtype: int64
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title', 'NameLength']
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation = 'vertical')
plt.show()

选出最佳的四个特征
predictors = ['Pclass', 'Sex', 'Title', 'NameLength']
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=4)
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7946331052664617
使用集成算法
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
algorithms = [
[GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]],
[LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])
predictions = []
for train, test in kf:
train_target = titanic['Survived'].iloc[train]
full_test_predictions = []
for alg, predictors in algorithms:
alg.fit(titanic[predictors].iloc[train,:], train_target)
test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
full_test_predictions.append(test_predictions)
test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
test_predictions[test_predictions <= .5] = 0
test_predictions[test_predictions > .5] = 1
predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)
accuracy = len(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)
0.8226711560044894
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)