数据预处理

# Load the Titanic training set; the `titanic` DataFrame is reused by every
# step below. .head() previews the first 5 rows.
import pandas as pd
titanic = pd.read_csv('./data/titanic_train.csv')
titanic.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
titanic.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

缺失项填充和字符数据映射成数字数据

# Fill the missing ages with the median age, then re-check the summary
# statistics (Age count rises to 891).
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
print(titanic['Sex'].unique())
['male' 'female']
# Encode Sex in place: male -> 0, female -> 1.
titanic.loc[titanic['Sex']=='male', 'Sex'] = 0
titanic.loc[titanic['Sex']=='female', 'Sex'] = 1
titanic.describe()
PassengerId Survived Pclass Sex Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 0.352413 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 0.477990 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 0.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 0.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 1.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 1.000000 80.000000 8.000000 6.000000 512.329200
# The raw column contains 'S', 'C', 'Q' and NaN (see printed output below).
print(titanic['Embarked'].unique())
# Fill missing ports with 'S' (presumably the most frequent port -- TODO
# confirm), then encode S/C/Q -> 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked']=='S', 'Embarked'] = 0
titanic.loc[titanic['Embarked']=='C', 'Embarked'] = 1
titanic.loc[titanic['Embarked']=='Q', 'Embarked'] = 2
print(titanic['Embarked'].unique())
['S' 'C' 'Q' nan]
[0 1 2]

使用线性回归算法实现

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Feature columns used to predict survival.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
# 3-fold split over the rows, in order (no shuffling).
fold = KFold(n_splits=3, shuffle=False).split(titanic[predictors])

predictions = []
for train_idx, test_idx in fold:
    # Fit on this fold's training rows ...
    alg.fit(titanic[predictors].iloc[train_idx, :],
            titanic['Survived'].iloc[train_idx])
    # ... and collect the predictions for its held-out rows.
    predictions.append(alg.predict(titanic[predictors].iloc[test_idx, :]))
import numpy as np

# Stitch the per-fold prediction arrays back into one array covering all rows.
predictions = np.concatenate(predictions, axis=0)
# Threshold the regression output at 0.5 into binary survival labels.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy = fraction of rows where the prediction matches the true label.
n_correct = len(predictions[predictions == titanic['Survived']])
accuracy = n_correct / len(predictions)
print(n_correct)
print(len(predictions))
print(accuracy)
698
891
0.7833894500561167

逻辑回归算法实现

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 'liblinear' was the default solver in this sklearn version (see the
# FutureWarning in the original output); naming it explicitly silences the
# warning without changing the results.
alg = LogisticRegression(solver='liblinear')
# Mean accuracy over 3 cross-validation folds.
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())
0.7878787878787877


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

使用随机森林算法

# Apply the same preprocessing to the Kaggle test set as to the training set.
titanic_test = pd.read_csv('./data/test.csv')
# Ages are filled with the TRAINING median (statistics come from train data).
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
# NOTE(review): Fare uses the TEST set's own median, unlike Age above --
# confirm this asymmetry is intended.
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())

# Same Sex encoding as the training set: male -> 0, female -> 1.
titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0 
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1

# Same Embarked encoding as the training set: S/C/Q -> 0/1/2.
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Cross-validate a small random forest on the training features.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7856341189674523
# Larger forest with stronger leaf/split regularization; CV accuracy improves
# from ~0.786 to ~0.816 (see printed outputs).
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.8159371492704826

再次进行特征处理

# Family size = siblings/spouses + parents/children aboard.
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# .apply builds a new Series; pass len directly instead of wrapping it in a
# redundant lambda.
titanic['NameLength'] = titanic['Name'].apply(len)
import re

def get_title(name):
    """Extract the honorific title from a passenger name.

    Titles appear as a space-prefixed word followed by a period,
    e.g. 'Braund, Mr. Owen Harris' -> 'Mr'.

    Returns '' when the name contains no title.
    """
    # Raw string: '\.' in a plain string is an invalid escape sequence
    # (DeprecationWarning on modern Python).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

# Raw title string for every passenger.
titles = titanic['Name'].apply(get_title)
# Frequency of each raw title (printed below).
print(pd.value_counts(titles))

# Numeric code per title; rare titles share codes (e.g. 7 = Major/Col/Capt,
# 10 = Lady/Countess/Jonkheer).
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
# Replace each title string with its code in place; a title missing from the
# mapping would stay a string (every title in the counts above is mapped).
for k,v in title_mapping.items():
    titles[titles == k] = v

print(pd.value_counts(titles))
titanic['Title'] = titles
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Lady          1
Mme           1
Countess      1
Jonkheer      1
Sir           1
Don           1
Ms            1
Capt          1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Univariate feature selection: F-test each candidate column against Survived.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title', 'NameLength']
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
# Convert the raw p-values to -log10 so that taller bars mean stronger features.
scores = -np.log10(selector.pvalues_)

# Bar chart of the per-feature scores, one bar per predictor.
positions = range(len(predictors))
plt.bar(positions, scores)
plt.xticks(positions, predictors, rotation='vertical')
plt.show()

4-Titanic船员生存实战

选出最佳的四个特征

# Re-run the forest on the four best-scoring features from the selector above.
predictors = ['Pclass', 'Sex', 'Title', 'NameLength']
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=4)
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7946331052664617

使用集成算法

from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# Ensemble: average the class-1 probabilities of a gradient-boosted tree and
# a logistic regression, each trained on its own feature list.
# 'liblinear' was the default solver in this sklearn version (see the
# FutureWarning in the original output); naming it keeps results identical
# while silencing the warning.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]],
    [LogisticRegression(random_state=1, solver='liblinear'),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])

predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Loop variable renamed from 'predictors' so it no longer shadows (and
    # clobbers) the module-level feature list defined above.
    for alg, alg_predictors in algorithms:
        alg.fit(titanic[alg_predictors].iloc[train, :], train_target)
        # The .astype(float) is necessary to convert the dataframe to all
        # floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(
            titanic[alg_predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)

    # Average the two models' probabilities ...
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # ... then threshold at 0.5 into a binary survival label.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
# Accuracy = fraction of rows where the ensemble matches the true label.
accuracy = len(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)
0.8226711560044894


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

相关文章:

  • 2021-10-29
  • 2021-08-12
  • 2021-12-04
  • 2022-12-23
  • 2021-10-12
  • 2022-12-23
  • 2022-01-19
  • 2021-09-27
猜你喜欢
  • 2022-12-23
  • 2021-08-10
  • 2021-12-16
  • 2022-12-23
  • 2021-12-10
  • 2021-05-12
  • 2022-12-23
相关资源
相似解决方案