Welcome to Hua Sheng Blog! This is data analysis case 0001. Original source: Kaggle; most of the content in this article comes from that notebook. Good luck!

Data Preprocessing & Feature Engineering

Data Preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.preprocessing import (OneHotEncoder, StandardScaler)
from sklearn.model_selection import (GridSearchCV, cross_val_score)
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier

full_df = pd.read_csv('../数据源/titanic/train.csv')
test_df = pd.read_csv("../数据源/titanic/test.csv")
full_df.head()
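# Quick sanity check (an addition, not in the source notebook): inspect
# dimensions and missing-value counts before any processing
print(full_df.shape)
print(full_df.isna().sum())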

Data Preview

# Separate test_df PassengerId (will need it for submission)
test_pass_id = test_df.pop('PassengerId')

# Keep the max index; it will be used later to split the combined data back into train and test
X_max_index = full_df.shape[0]

# Separate features and target
y = full_df.Survived

df = full_df.drop(['Survived', 'PassengerId'], axis=1)
df = pd.concat([df, test_df], axis=0).reset_index(drop=True)

full_df.corr(numeric_only=True)["Survived"].sort_values()  # numeric_only is required on pandas >= 2.0

df.hist(figsize=(10,10))

Data Distribution

plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render CJK glyphs
plt.rcParams["axes.unicode_minus"] = False
fig = plt.figure(figsize=(18,18))
fig.set(alpha=0.2)  # set the figure's alpha parameter


plt.subplot2grid((2,3),(0,0))  # arrange several subplots on one large figure
full_df.Survived.value_counts().plot(kind='bar')  # bar chart
plt.title("Survival outcome")
plt.ylabel("Passengers")
plt.xticks(np.arange(2), ['Died', 'Survived'], rotation=0)

plt.subplot2grid((2,3),(0,1))
full_df.Pclass.value_counts().plot(kind="bar")
plt.ylabel("Passengers")
plt.title("Passenger class distribution")
plt.xticks(np.arange(3), ['3rd class', '1st class', '2nd class'], rotation=0)

plt.subplot2grid((2,3),(0,2))
plt.scatter(full_df.Survived, full_df.Age)
plt.ylabel("Age")
plt.grid(visible=True, which='major', axis='y')  # the 'b' keyword was removed in matplotlib 3.5
plt.title("Survival by age (1 = survived)")


plt.subplot2grid((2,3),(1,0), colspan=2)
full_df.Age[full_df.Pclass == 1].plot(kind='kde')
full_df.Age[full_df.Pclass == 2].plot(kind='kde')
full_df.Age[full_df.Pclass == 3].plot(kind='kde')
plt.xlabel("Age")
plt.ylabel("Density")
plt.title("Age distribution by passenger class")
plt.legend(('1st class', '2nd class', '3rd class'), loc='best')


plt.subplot2grid((2,3),(1,2))
full_df.Embarked.value_counts().plot(kind='bar')
plt.title("Passengers by embarkation port")
plt.ylabel("Passengers")
plt.show()

Data Correlation

df['Lastname'] = df.Name.str.split(', ').str[0]

# Extracting the Title from Name column
df['Title'] = df.Name.str.split(', ').str[1]
df['Title'] = df.Title.str.split('.').str[0]

df.Title.value_counts()
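# Worked illustration of the parsing above, using the first training row:
name = 'Braund, Mr. Owen Harris'
lastname = name.split(', ')[0]             # -> 'Braund'
title = name.split(', ')[1].split('.')[0]  # -> 'Mr'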

# Group similar titles

# We also change Miss to Mrs, but later we will convert it back
# to Miss for underage females only; as it stands, Miss is not
# very useful because it denotes both a young girl and
# an unmarried adult woman of any age


females = ['Ms', 'Miss', 'Mlle', 'Mrs', 'Mme']
df.loc[df.Title.isin(females), 'Title'] = 'Mrs'

males = ['Master', 'Mr']
df.loc[(df.Title.isin(males)), 'Title'] = 'Mr'

# Change the titles for underage persons to Master and Miss
df.loc[((df.Title == 'Mr') & (df.Age < 18)), 'Title'] = 'Master'
df.loc[((df.Title == 'Mrs') & (df.Age < 18)), 'Title'] = 'Miss'

# Create noble title
df.loc[(~df.Title.isin(females) & ~df.Title.isin(males)), 'Title'] = 'Noble'

# Split Ticket by series and number
df['Ticket_series'] = [i[0] if len(i) > 1 else 0 for i in df.Ticket.str.split()]
df['Ticket_nr'] = [i[-1] for i in df.Ticket.str.split()]

# Create a column with the number of passengers per ticket
ticket_dict = df.groupby('Ticket_nr').Lastname.count().to_dict()

df['Passengers_ticket'] = df.Ticket_nr.map(ticket_dict)

# Create Price column (Fare is the total price per ticket,
# so dividing by the passenger count gives a per-person price)
df['Price'] = (df.Fare / df.Passengers_ticket).round()

# Deck is the first letter of the Cabin code
df['Deck'] = df.Cabin.str[0]

# Function for imputing Deck
def impute_deck_by(feature):
    for pclass in range(1, 4):
        # Create a mapping dictionary
        map_dic = (df[~df.Deck.isna()
                      & (df.Pclass == pclass)]
                   .groupby(feature)
                   .Deck.unique()
                   .apply(list).to_dict())

        # Keep just the keys that map to a single deck, to avoid
        # the same key pointing at different decks
        map_dic = {i: j[0] for i, j in map_dic.items()
                   if len(j) == 1}

        # Impute Deck from map_dic
        df.loc[df.Deck.isna() & (df.Pclass == pclass),
               'Deck'] = df[feature].map(map_dic)

# Check how many missing values we have at this step
print(df.Deck.isna().sum())
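# The listing defines impute_deck_by but does not show its invocations;
# presumably it is called with features shared within a ticket or family
# group, for example (assumed calls, not shown in the source):
impute_deck_by('Ticket_nr')
impute_deck_by('Lastname')
print(df.Deck.isna().sum())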

# Analyze Price by Deck and Pclass
df.groupby(['Pclass', 'Deck']).Price.describe()

df.loc[df.Deck == 'T', 'Deck'] = 'A'  # fold the single boat-deck 'T' cabin into deck 'A'

df[df.Deck == 'B'].sort_values('Price').head()

# Replace zero fares with the mean per-person price of the same class
for i in [1, 2, 3]:
    for j in df[(df["Fare"] == 0) & (df["Pclass"] == i)]["Price"].index:
        df.loc[j, "Price"] = df[(df["Fare"] != 0) & (df["Pclass"] == i)]["Price"].mean()

df.loc[(df.Ticket_nr == '695'), 'Price'] = 19  # manual correction for ticket 695

class_deck_price = pd.DataFrame(df.groupby(['Pclass', 'Deck'])
                                .Price.mean().round(2)).reset_index()

# Impute missing prices
# Where Deck is missing we will use the mean price by Pclass only
for index, row in df.loc[df.Price.isna(),
                         ['Pclass', 'Deck']].iterrows():
    if not pd.isna(row.Deck):
        new_price = class_deck_price.loc[
            ((class_deck_price.Pclass == row.Pclass)
             & (class_deck_price.Deck == row.Deck)), 'Price'].mean()
    else:
        new_price = class_deck_price[
            class_deck_price.Pclass == row.Pclass].Price.mean()

    df.loc[[index], 'Price'] = new_price

# Create dictionaries with approximate price ranges per deck,
# concluded from the previous analysis
first_cl = {'A': [25, 30],
            'B': [35, 70],
            'C': [30, 35],
            'D': [19, 25],
            'E': [9, 19]}

second_cl = {'D': [13, 17],
             'E': [5, 9],
             'F': [9, 13]}

third_cl = {'E': [8, 9],
            'F': [9, 21],
            'G': [0, 8]}

# Create a dictionary pairing each Pclass with its price dictionary
class_dict = {1: first_cl,
              2: second_cl,
              3: third_cl}

# Impute missing Deck values
for index, row in df.loc[df.Deck.isna(), ['Pclass', 'Price']].iterrows():
    for c, d in class_dict.items():
        if row.Pclass == c:
            for i, j in d.items():
                if max(j) > row.Price >= min(j):
                    df.loc[[index], 'Deck'] = i

# Encode Deck with its deck-level number, counting from the bottom
deck_level = {'G': 1, 'F': 2, 'E': 3, 'D': 4, 'C': 5, 'B': 6, 'A': 7}

df.Deck = df.Deck.replace(deck_level)

# Analyse how many people were on each deck.
# Many values were imputed approximately, but at least we get an
# approximate crowd mass each passenger had to pass while going up
deck_people = df.Deck.value_counts().sort_index().to_dict()

# Create an escape-density dictionary from which we will fill the new feature
escape_density = {}
for i in range(1, 8):
    escape_density[i] = sum(deck_people.values())
    del deck_people[i]

# Create Escape_density column
df['Escape_density'] = df.Deck.replace(escape_density)
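# Worked illustration with hypothetical counts: if deck_people were
# {1: 500, 2: 300, ..., 7: 50}, a passenger on level 1 (deck G, the bottom)
# has everybody above them, so escape_density[1] is the total passenger
# count, while escape_density[7] (deck A) counts only deck A itself.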

# Family size is the passenger plus their SibSp and Parch
df['Family_size'] = 1 + df.SibSp + df.Parch

# Create full data frame for analysis
X = df[:X_max_index]
test_df = df[X_max_index:].copy()
full_df = pd.concat([X, y], axis=1).copy()

# Check which families have survivors and create a dictionary with each family's mean survival rate
family_survivers = full_df[['Lastname', 'Survived']].groupby('Lastname').mean().round(2).reset_index()
family_survivers_dict = dict(zip(family_survivers.Lastname, family_survivers.Survived))

# Reduce the dictionary to families that appear in both train and test data
test_lastnames = set(test_df['Lastname'].unique())
common_survivers = {}
for lastname, survived in family_survivers_dict.items():
    if lastname in test_lastnames:
        common_survivers[lastname] = survived

# Create Family_survivers feature
test_df['Family_survivers'] = test_df.Lastname.map(common_survivers)
full_df['Family_survivers'] = full_df.Lastname.map(common_survivers)

# For the families that are not present in both train and test we will impute the overall mean value
test_df.Family_survivers = test_df.Family_survivers.fillna(test_df.Family_survivers.mean())
full_df.Family_survivers = full_df.Family_survivers.fillna(full_df.Family_survivers.mean())

# Separate back features and target
y = full_df.Survived

df = full_df.drop('Survived', axis=1)
df = pd.concat([df, test_df], axis=0).reset_index(drop=True)

# Change Pclass dtype to category as it's a categorical feature
df.Pclass = df.Pclass.astype('category')

col_drop = ['Name', 'Ticket', 'Fare', 'Cabin', 'Lastname', 'Ticket_nr',
            'Ticket_series', 'Passengers_ticket']
df = df.drop(col_drop, axis=1)

Data Overview Before Modeling

Pre-Modeling Analysis

# List of categorical columns
categ_cols = list(df.select_dtypes(['object', 'category']).columns)

# Impute categoricals with most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')

df_cat = cat_imputer.fit_transform(df[categ_cols])
df_cat = pd.DataFrame(df_cat, columns=df[categ_cols].columns)

# Encode categoricals with one-hot encoding
# (sparse_output replaces the old sparse keyword on scikit-learn >= 1.2)
ohe = OneHotEncoder(sparse_output=False)

df_cat = pd.DataFrame(ohe.fit_transform(df_cat),
                      columns=ohe.get_feature_names_out())

# List of numerical columns
num_cols = [col for col in df.columns
            if df[col].dtype in ['int64', 'float64']]

# Impute numericals
it_imp = IterativeImputer()

df_num = pd.DataFrame(it_imp.fit_transform(df[num_cols]),
                      columns=df[num_cols].columns)

# Concatenate with encoded categorical columns
df = pd.concat([df_cat, df_num], axis=1)

df["mother"] = " "
df.loc[(df["Age"] >= 32)& (df['Sex_female'] == 1) & (df['Parch'] > 1),"mother"] = "mother"
dummies_mother = pd.get_dummies(df['mother'], prefix= 'mother')
df = pd.concat([df,dummies_mother], axis=1)

# Create a full data frame for analysis
X = df[:X_max_index]
full_df = pd.concat([X, y], axis=1)

def survive_chance_by(feature, xticks=None, xlim=None):
    survived = full_df[full_df.Survived == 1]
    not_survived = full_df[full_df.Survived == 0]

    plt.figure(figsize=(10, 5))

    survived[feature].plot(kind='kde', label='survived')
    not_survived[feature].plot(kind='kde', label='not_survived')

    plt.xlim(xlim)
    plt.xticks(xticks)
    plt.legend()
    plt.grid()
    plt.xlabel(feature)
    plt.show()

survive_chance_by('Age', np.arange(0, 81, 5), (0, 80))

df['Age_group'] = pd.cut(x=df.Age, bins=[0, 15, 32, 44, df.Age.max()],
                         labels=['Child', 'Young', 'Adult', 'Old'])
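# pd.cut assigns each Age to the bin it falls into, e.g. (illustrative):
# Age 10 -> 'Child', Age 25 -> 'Young', Age 40 -> 'Adult', Age 60 -> 'Old'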

survive_chance_by('Family_size', np.arange(0, 10, 1), (0, 10))

# Survivors by Family_survivers
survive_chance_by('Family_survivers', np.arange(0, 1.5, 0.1), (0, 1.5))

df['Lucky_family'] = pd.cut(x=df.Family_survivers,
                            bins=[0, 0.22, 0.35, 0.49, df.Family_survivers.max()],
                            labels=['Very_low', 'Low', 'Medium', 'High'])  # labels in ascending bin order
# Encode categoricals
df = pd.get_dummies(df)

# Apply log1p to normalize the right-skewed Price
df.Price = df.Price.apply(np.log1p)

# Standardize
std_scaler = StandardScaler()

df_scaled = std_scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns=df.columns)

# Drop features not used for modeling
cols_to_drop = ['Family_survivers', 'SibSp', 'Parch', 'Family_size']
df = df.drop(cols_to_drop, axis=1)

X = df[:X_max_index]
test_df = df[X_max_index:]

# Concatenate into a full dataset
full_df = pd.concat([X, y], axis=1)

correlation = full_df.corr()['Survived'].sort_values(ascending=False)

# Correlation graph
correlation[1:].plot(kind='bar', figsize=(10,5), title='Survivability dependency')
plt.show()

Figure: survival by age

Figure: survival by family size

Figure: survival by family survival index

Figure: correlation with Survived

Model Training

from sklearn import ensemble

# Fit a RandomForestClassifier
clf = ensemble.RandomForestClassifier(n_estimators=31,
                                      random_state=42,
                                      max_depth=8,
                                      min_samples_split=11,
                                      max_features=7)
clf.fit(X, y)

clf.score(X, y)

# Coarse sweep over n_estimators with 10-fold cross-validation
scorel = []
for i in range(0, 300, 10):
    rfc = ensemble.RandomForestClassifier(n_estimators=i+1, random_state=42)
    score = cross_val_score(rfc, X, y, cv=10).mean()
    scorel.append(score)
print(max(scorel), (scorel.index(max(scorel)) * 10) + 1)
plt.figure(figsize=[20,5])
plt.plot(range(1, 301, 10), scorel)

# Fine sweep around the best coarse value
scorel = []
for i in range(260, 280, 1):
    rfc = ensemble.RandomForestClassifier(n_estimators=i+1, random_state=42)
    score = cross_val_score(rfc, X, y, cv=10).mean()
    scorel.append(score)
print(max(scorel), 261 + scorel.index(max(scorel)))  # n_estimators = i + 1
plt.figure(figsize=[20,5])
plt.plot(range(261, 281, 1), scorel)

# Tune max_depth
param_grid = {'max_depth': np.arange(1, 20, 1)}
# Probe a range suited to the data size: this dataset is small, so 1-10
# or 1-20 is enough; for large data such as digit recognition, try depths
# of 30-50 (perhaps more), and plot a learning curve to see how depth
# affects the model
rfc = ensemble.RandomForestClassifier(n_estimators=270,
                                      random_state=42)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(X, y)
GS.best_score_

# Next we tune min_samples_split, pushing the model toward lower complexity
param_grid = {'min_samples_split': np.arange(2, 2+20, 1)}
rfc = ensemble.RandomForestClassifier(n_estimators=270,
                                      random_state=42,
                                      max_depth=9)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(X, y)
GS.best_params_  # show the best parameters found

# Next we tune max_features
param_grid = {'max_features': np.arange(3, 30, 1)}
rfc = ensemble.RandomForestClassifier(n_estimators=270,
                                      random_state=42,
                                      max_depth=9,
                                      min_samples_split=17)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(X, y)
GS.best_params_  # show the best parameters found
# Pushing left lowers the score here, so we push the model to the right instead

# Final model with the tuned parameters
rfc = ensemble.RandomForestClassifier(n_estimators=270,
                                      random_state=42,
                                      max_depth=9,
                                      min_samples_split=17,
                                      max_features=16)
rfc.fit(X, y)
rfc.score(X, y)
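# Training accuracy is optimistic; a cross-validated estimate (added as a
# sanity check, not part of the source) gives a fairer picture:
print(cross_val_score(rfc, X, y, cv=10).mean())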

predictions = rfc.predict(test_df)
result = pd.DataFrame({'PassengerId': test_pass_id,
                       'Survived': predictions.astype(np.int32)})
result.head(10)
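# Write the Kaggle submission file (standard final step; the filename here
# is an assumption, as the source stops at previewing the result)
result.to_csv('submission.csv', index=False)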