Data Creation Process for ASNA 2019 Case Competition Technical Challenge

Original Dataset was found online, with a source no longer active It was later perturbed so that even if the dataset was still online, no useful data can be gathered for the purposes of winning the technical challenge

Please note that this is a rough version of the notebook written on a short notice. I will post the cleaned up version when I have the moment.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy import stats
#Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

np.random.seed(1)
%matplotlib inline
In [2]:
# Import Data
initial_data = pd.read_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/initial_data.csv")
# See the first few rows
initial_data.head()
Out[2]:
Customer Country State Code State Claim Amount Response Coverage Education EmploymentStatus Gender ... Monthly Premium Auto Months Since Last Claim Months Since Policy Inception Number of Open Complaints Number of Policies Claim Reason Sales Channel Total Claim Amount Vehicle Class Vehicle Size
0 1 US KS Kansas 276.351928 No Basic Bachelor Employed F ... 69 32 5 0 1 Collision Agent 384.811147 Two-Door Car Medsize
1 2 US NE Nebraska 697.953590 No Extended Bachelor Unemployed F ... 94 13 42 0 8 Scratch/Dent Agent 1131.464935 Four-Door Car Medsize
2 3 US OK Oklahoma 1288.743165 No Premium Bachelor Employed F ... 108 18 38 0 2 Collision Agent 566.472247 Two-Door Car Medsize
3 4 US MO Missouri 764.586183 No Basic Bachelor Unemployed M ... 106 18 65 0 7 Collision Call Center 529.881344 SUV Medsize
4 5 US KS Kansas 281.369258 No Basic Bachelor Employed M ... 73 12 44 0 1 Collision Agent 138.130879 Four-Door Car Medsize

5 rows × 23 columns

In [3]:
# Drop the Country, State name, Response, and Total Claim Amount Columns
initial_data = initial_data.drop(axis =1, labels=["Country", "State", "Response", "Total Claim Amount"])
# Perturb the dataset slightly so that it can't be searched on the internet easily and the results won't match.
initial_data["Claim Amount"] += np.random.normal(loc=0, scale=np.sqrt(5))
# Claim values looked low so multiply them by 5 (doesn't change anything for the challenge)
initial_data["Claim Amount"] *= 5
In [4]:
# See column values, to create claim severity classes
print(initial_data["Claim Amount"].describe())
q25 = initial_data["Claim Amount"].quantile(0.25)
q50 = initial_data["Claim Amount"].median()
q75 = initial_data["Claim Amount"].quantile(0.75)

# Claim classes based on the quantiles of the original (this is significantly faster than computing the quantiles at each iteration)
def create_claim_class(row):
    if row["Claim Amount"] < q25:
        return 1
    elif row["Claim Amount"] < q50:
        return 2
    elif row["Claim Amount"] < q75:
        return 3
    else :
        return 4
     
count     9134.000000
mean      4020.630971
std       3435.483804
min        967.164571
25%       2015.286630
50%       2908.251832
75%       4499.244254
max      41680.851328
Name: Claim Amount, dtype: float64
In [5]:
initial_data["Claim Class"] = initial_data.apply(lambda row: create_claim_class(row), axis=1)
initial_data["Claim Class"].describe()
initial_data["Claim Class"].value_counts()
Out[5]:
3    2288
4    2284
1    2284
2    2278
Name: Claim Class, dtype: int64
In [6]:
# Define the one-hot encoder 
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

final_data, cat_cols = one_hot_encoder(initial_data, nan_as_category=False)
In [7]:
#define label encoder for the string data:
lb = preprocessing.LabelBinarizer()

#Gender has to be done separately because it has just 2 classes.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# TODO: Automate this 
list_string_vars = ("State Code", "Coverage", "Education", "EmploymentStatus", "Location Code", "Marital Status", "Claim Reason", "Sales Channel", "Vehicle Class", "Vehicle Size")

final_data = initial_data.select_dtypes(include=numerics)
final_data = final_data.drop(axis=1, labels=["Claim Class"])
for var in list_string_vars:
    lb.fit(initial_data[var])
    encoding = lb.transform(initial_data[var])
    labels = lb.classes_
    #make it into a data frame:
    df = pd.DataFrame(data= encoding,
                     columns = labels)
    final_data = pd.concat([final_data, df], axis=1)
df_gender = pd.DataFrame(data=lb.fit_transform(initial_data["Gender"]),
                        columns=["Male"])

final_data = pd.concat([final_data, df_gender,initial_data["Claim Class"]], axis=1)
print(final_data.columns)

#Save the final Preprocessed Data
#final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/preprocessed_data.csv")
Index(['Customer', 'Claim Amount', 'Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'IA', 'KS', 'MO',
       'NE', 'OK', 'Basic', 'Extended', 'Premium', 'Bachelor', 'College',
       'Doctor', 'High School or Below', 'Master', 'Disabled', 'Employed',
       'Medical Leave', 'Retired', 'Unemployed', 'Rural', 'Suburban', 'Urban',
       'Divorced', 'Married', 'Single', 'Collision', 'Hail', 'Other',
       'Scratch/Dent', 'Agent', 'Branch', 'Call Center', 'Web',
       'Four-Door Car', 'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car',
       'Two-Door Car', 'Large', 'Medsize', 'Small', 'Male', 'Claim Class'],
      dtype='object')

Now I will create and train a few models that will be predicting the values of "Claim Class" column using all the others

Later I will be using those models to greate "true" values for the enhanced dataset.

Currently models:

  • Decision Tree
  • SVM
  • Random Forest
  • Adaboost
  • Gaussian Naive Bayes
  • Multilayer Perceptron
  • Quadratic Discriminant Analysis
  • Gradient Bossting Classifier
In [8]:
all_features = final_data.drop(axis=1, labels =["Customer", "Claim Amount"])
all_targets = all_features.pop("Claim Class")
train_features , test_features, train_targets, test_targets = train_test_split(all_features, all_targets, test_size=0.2)
In [9]:
# Initialize all models in a list
models = [DecisionTreeClassifier(max_depth=8),
          # SVC is overfitting like crazy, needs regularization
          SVC(C=1, decision_function_shape="ovo", verbose=True),
          RandomForestClassifier(max_depth=12, n_estimators=100),
          MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000,alpha=0.001,tol = 0.000000001, solver="lbfgs", verbose=True, activation = "tanh"),
          AdaBoostClassifier(n_estimators=200, learning_rate=0.05),
          GaussianNB(),
          QuadraticDiscriminantAnalysis(reg_param=0),
          GradientBoostingClassifier(verbose=True),
         ]
# Define all the model names
model_names = ["Decision Tree",
               "SVC",
               "Random Forest",
               "Neural Net",
               "Adaboost",
               "Gaussian Naive Bayes",
               "QDA",
               "Gradient Boosting Classifier"
              ]
# Print the data size
print("Training Data size: {}".format(train_features.shape))

# Loop over models instead of having separate cell per model
for name, model in zip(model_names, models):
    model.random_state = 0
    print("Training Model :  {}".format(name))
    model.fit(train_features, train_targets)
    print("Done Training {}".format(name))
    test_score = model.score(test_features, test_targets) * 100
    train_score = model.score(train_features, train_targets) * 100
    print("{} Train Score : {}".format(name, train_score))
    print("{} Test Score : {}".format(name, test_score))
    print("~"*50)
    print("Model Done")
    print("~"*50)

print("*"*50)    
print("~"*50)
print("ALL MODELS DONE")
print("~"*50)
print("*"*50)
Training Data size: (7307, 48)
Training Model :  Decision Tree
Done Training Decision Tree
Decision Tree Train Score : 94.9637334063227
Decision Tree Test Score : 91.40667761357416
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  SVC
[LibSVM]Done Training SVC
SVC Train Score : 97.15341453400849
SVC Test Score : 42.8024083196497
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Random Forest
Done Training Random Forest
Random Forest Train Score : 97.94717394279459
Random Forest Test Score : 92.77504105090311
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Neural Net
Done Training Neural Net
Neural Net Train Score : 42.38401532776789
Neural Net Test Score : 41.43404488232074
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Adaboost
Done Training Adaboost
Adaboost Train Score : 80.84029013274942
Adaboost Test Score : 81.33552271483306
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Gaussian Naive Bayes
Done Training Gaussian Naive Bayes
Gaussian Naive Bayes Train Score : 63.227042561926915
Gaussian Naive Bayes Test Score : 61.63108921729611
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  QDA
Done Training QDA
QDA Train Score : 50.82797317640618
QDA Test Score : 51.450465243568686
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Gradient Boosting Classifier
      Iter       Train Loss   Remaining Time 
         1        8685.3634            8.05s
         2        7597.4283            8.35s
         3        6739.1580            8.10s
         4        5940.5194            8.04s
         5        5289.7698            8.01s
         6        4759.1374            8.06s
         7        4310.9545            8.11s
         8        3969.5428            7.88s
         9        3645.7359            7.69s
        10        3394.6779            7.54s
        20        2062.2822            6.24s
        30        1558.1028            5.43s
        40        1314.1766            4.54s
        50        1169.6003            3.71s
        60        1076.2102            2.91s
        70        1005.6195            2.16s
        80         950.2442            1.42s
        90         901.5966            0.70s
       100         847.4721            0.00s
Done Training Gradient Boosting Classifier
Gradient Boosting Classifier Train Score : 96.16805802654989
Gradient Boosting Classifier Test Score : 93.10344827586206
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**************************************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ALL MODELS DONE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**************************************************

Now that we have our Models pre-trained it's time to generate some fake profiles. To account for correlations in socioeconomic data will split into socioeconomic and non, and sample from socioeconomic together. This makes the challenge a bit easier, since the socioeconomic data will almost necessarily be closer between trainand test sets, but inducing a correlation structure is not feasible in a short amount of time. I also split out car specific variables to avoid small sized SUV's and other problematic datapoints.

In [10]:
# Split out the socioeconomic indicators:
socioeconomic_columns = ["Education", "EmploymentStatus", "Income", "Marital Status"]
socioeconomic_data = initial_data[socioeconomic_columns]
car_columns = ["Vehicle Class", "Vehicle Size"]
car_data = initial_data[car_columns]

# Split other data out
nonsocioeconomic_data = initial_data.drop(socioeconomic_columns, axis = 1)
uncorr_data = nonsocioeconomic_data.drop(car_columns, axis = 1)
In [11]:
# Initialize the data frames for storing results
new_data_uncorr = pd.DataFrame()
new_data_socio = pd.DataFrame()
new_data_car = pd.DataFrame()
# Generate new uncorrelated data
for column in uncorr_data.columns:
    new_data_uncorr.reset_index(inplace=True,drop=True)
    to_join = uncorr_data[column].sample(50000, replace=True)
    to_join.reset_index(inplace=True,drop=True)
    new_data_uncorr = pd.concat([new_data_uncorr, to_join ],axis=1)
# Sample the socioeconomic data
new_data_socio = socioeconomic_data.sample(50000, replace=True)
# Sample the car data
new_data_car = car_data.sample(50000, replace = True)
# Reset inices for merging
new_data_socio.reset_index(inplace=True, drop=True)
new_data_uncorr.reset_index(inplace=True, drop=True)
new_data_car.reset_index(inplace=True, drop=True)
# Combine the data
new_data = pd.concat([new_data_socio, new_data_uncorr],  axis=1)
new_data = pd.concat([new_data, new_data_car], axis=1)
new_data = new_data.drop(axis=1, labels=["Customer", "Claim Amount", "Claim Class"])
new_data
Out[11]:
Education EmploymentStatus Income Marital Status State Code Coverage Gender Location Code Monthly Premium Auto Months Since Last Claim Months Since Policy Inception Number of Open Complaints Number of Policies Claim Reason Sales Channel Vehicle Class Vehicle Size
0 Doctor Employed 73853 Single MO Basic M Suburban 71 3 8 1 1 Other Agent SUV Small
1 Doctor Employed 66889 Married KS Basic F Suburban 83 3 4 0 1 Hail Agent Four-Door Car Medsize
2 College Employed 60462 Married NE Extended M Suburban 87 20 32 3 4 Hail Agent Four-Door Car Medsize
3 Bachelor Employed 97361 Single NE Basic F Suburban 266 4 93 0 2 Collision Agent Four-Door Car Medsize
4 College Unemployed 0 Single NE Basic F Rural 68 29 94 0 4 Hail Agent Four-Door Car Medsize
5 Bachelor Medical Leave 13037 Married MO Extended M Suburban 112 5 27 0 9 Hail Call Center Four-Door Car Large
6 High School or Below Employed 34510 Divorced IA Extended M Suburban 84 1 70 0 8 Hail Branch Four-Door Car Small
7 High School or Below Employed 51255 Married IA Basic F Urban 102 11 45 0 1 Collision Agent Two-Door Car Medsize
8 Bachelor Employed 30389 Married MO Extended F Rural 139 15 41 0 6 Hail Web Luxury SUV Medsize
9 Bachelor Employed 21947 Married MO Basic M Urban 133 2 30 0 1 Other Branch Four-Door Car Medsize
10 Bachelor Employed 49853 Single NE Extended M Suburban 68 31 44 4 2 Scratch/Dent Call Center SUV Medsize
11 Bachelor Unemployed 0 Married NE Premium F Suburban 77 3 47 0 1 Collision Branch SUV Medsize
12 Bachelor Employed 70432 Married OK Extended F Urban 134 2 79 0 1 Other Call Center Four-Door Car Medsize
13 College Employed 88082 Married MO Premium M Rural 136 13 36 0 8 Other Branch Four-Door Car Medsize
14 High School or Below Employed 22244 Married NE Extended F Suburban 67 31 25 0 1 Collision Agent SUV Medsize
15 High School or Below Employed 82320 Married IA Extended M Urban 64 23 60 1 1 Scratch/Dent Agent SUV Medsize
16 Bachelor Employed 24285 Divorced KS Basic F Suburban 64 9 30 1 1 Hail Call Center Four-Door Car Medsize
17 College Employed 67326 Married NE Extended M Suburban 72 25 27 0 2 Collision Branch SUV Medsize
18 Bachelor Unemployed 0 Single NE Basic M Rural 61 22 18 0 2 Hail Branch Two-Door Car Medsize
19 Bachelor Employed 68004 Single IA Basic M Suburban 62 2 8 0 1 Collision Agent Sports Car Large
20 Bachelor Employed 36088 Single IA Basic F Suburban 283 16 77 0 1 Collision Branch Four-Door Car Medsize
21 College Employed 20396 Married IA Extended F Suburban 138 33 37 0 3 Hail Agent Four-Door Car Medsize
22 Bachelor Employed 58781 Divorced MO Basic M Suburban 73 7 0 3 3 Hail Branch Four-Door Car Medsize
23 Bachelor Employed 92296 Married NE Extended F Suburban 63 24 97 0 1 Hail Call Center Luxury SUV Medsize
24 High School or Below Employed 50549 Divorced IA Basic M Suburban 74 35 48 0 9 Hail Call Center Four-Door Car Medsize
25 High School or Below Employed 48552 Married NE Basic M Urban 68 1 94 0 2 Scratch/Dent Branch Two-Door Car Medsize
26 College Employed 91541 Married MO Basic F Rural 63 32 10 1 2 Collision Agent Four-Door Car Large
27 Bachelor Employed 33190 Divorced OK Basic M Suburban 125 15 20 0 2 Collision Agent SUV Large
28 College Employed 52982 Divorced OK Extended F Suburban 119 34 49 0 1 Collision Agent Sports Car Medsize
29 Bachelor Unemployed 0 Single IA Basic F Suburban 68 6 76 0 9 Collision Branch Four-Door Car Small
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49970 High School or Below Employed 54586 Married NE Extended F Suburban 239 27 34 0 1 Collision Call Center Four-Door Car Medsize
49971 College Employed 69161 Married MO Premium F Suburban 73 1 23 0 8 Hail Web Two-Door Car Medsize
49972 College Employed 37937 Married NE Extended M Suburban 95 3 73 0 8 Hail Call Center SUV Medsize
49973 High School or Below Medical Leave 17120 Married MO Basic M Urban 137 19 42 1 3 Hail Branch Four-Door Car Medsize
49974 Master Unemployed 0 Married OK Basic F Urban 69 20 69 0 8 Hail Agent Four-Door Car Medsize
49975 Master Employed 88997 Divorced MO Basic M Suburban 63 3 14 3 2 Hail Agent Two-Door Car Medsize
49976 College Employed 47199 Married MO Basic M Rural 96 16 91 0 7 Collision Agent Four-Door Car Medsize
49977 High School or Below Employed 41520 Married NE Basic M Urban 68 8 42 0 1 Hail Branch Sports Car Medsize
49978 College Employed 62396 Married NE Extended M Suburban 64 2 54 0 1 Collision Web Two-Door Car Medsize
49979 Bachelor Employed 28909 Married IA Extended M Urban 118 34 88 0 1 Other Branch SUV Medsize
49980 Bachelor Employed 39547 Married OK Basic M Urban 134 33 37 0 1 Hail Agent Four-Door Car Small
49981 Bachelor Employed 22697 Divorced NE Basic F Suburban 80 4 94 3 3 Collision Branch SUV Medsize
49982 Bachelor Employed 85896 Married MO Extended F Suburban 61 26 31 0 6 Collision Agent Four-Door Car Medsize
49983 Bachelor Unemployed 0 Married NE Extended F Rural 67 1 28 0 2 Collision Branch Four-Door Car Large
49984 College Unemployed 0 Single MO Basic F Suburban 62 23 24 4 2 Other Agent SUV Medsize
49985 Master Employed 80540 Married MO Extended M Suburban 73 1 35 0 1 Collision Agent Luxury Car Medsize
49986 College Employed 83846 Married IA Extended F Suburban 130 3 73 0 2 Scratch/Dent Web Four-Door Car Medsize
49987 Bachelor Disabled 16101 Divorced IA Basic F Suburban 115 11 72 0 4 Hail Agent Four-Door Car Medsize
49988 High School or Below Unemployed 0 Married MO Basic F Urban 110 16 3 0 2 Scratch/Dent Agent Four-Door Car Large
49989 Master Employed 74775 Divorced NE Extended M Suburban 74 29 38 0 3 Hail Agent Two-Door Car Large
49990 High School or Below Employed 24239 Married MO Extended F Rural 128 27 48 0 9 Collision Web Two-Door Car Small
49991 High School or Below Employed 86451 Single IA Basic M Rural 92 1 87 0 1 Other Agent Four-Door Car Medsize
49992 College Medical Leave 22981 Single IA Basic M Rural 69 4 22 0 2 Scratch/Dent Call Center Two-Door Car Medsize
49993 High School or Below Employed 55274 Married MO Basic M Suburban 102 3 15 0 2 Other Agent Four-Door Car Medsize
49994 Bachelor Employed 33402 Married OK Basic F Rural 121 8 93 0 2 Hail Branch Four-Door Car Medsize
49995 High School or Below Employed 37995 Married MO Premium F Urban 63 7 63 1 1 Hail Agent SUV Small
49996 College Employed 60108 Married OK Basic M Suburban 66 11 37 0 3 Collision Agent Four-Door Car Large
49997 Master Employed 68197 Married MO Basic M Suburban 96 29 25 0 4 Collision Branch Two-Door Car Medsize
49998 Master Medical Leave 12626 Single IA Basic F Suburban 82 6 18 0 1 Scratch/Dent Agent Four-Door Car Medsize
49999 Doctor Employed 40870 Married MO Basic F Suburban 61 35 84 0 3 Collision Branch Two-Door Car Medsize

50000 rows × 17 columns

Now encode the new data so that I can predict the Claim Class values and set them as the new true lables for the challenge

In [12]:
# Take just the numerical columns, cause they don't need encoding
new_encoded_data = new_data.select_dtypes(include=numerics)
# Make the encoding for categorical vars
for var in list_string_vars:
    # Print the string variables (debugging)
    #print(var,type(var))
    lb.fit(new_data[var])
    encoding = lb.transform(new_data[var])
    labels = lb.classes_
    #make it into a data frame:
    df = pd.DataFrame(data= encoding,
                     columns = labels)
    new_encoded_data = pd.concat([new_encoded_data, df], axis=1)
# Again gender has to be separated out since it has just 2 classes
df_gender = pd.DataFrame(data=lb.fit_transform(new_data["Gender"]),
                        columns=["Male"])
# Combine Data
new_encoded_data = pd.concat([new_encoded_data, df_gender], axis=1)
new_encoded_data = pd.concat([new_encoded_data, ])
# Print columns
print(new_encoded_data.columns)
Index(['Income', 'Monthly Premium Auto', 'Months Since Last Claim',
       'Months Since Policy Inception', 'Number of Open Complaints',
       'Number of Policies', 'IA', 'KS', 'MO', 'NE', 'OK', 'Basic', 'Extended',
       'Premium', 'Bachelor', 'College', 'Doctor', 'High School or Below',
       'Master', 'Disabled', 'Employed', 'Medical Leave', 'Retired',
       'Unemployed', 'Rural', 'Suburban', 'Urban', 'Divorced', 'Married',
       'Single', 'Collision', 'Hail', 'Other', 'Scratch/Dent', 'Agent',
       'Branch', 'Call Center', 'Web', 'Four-Door Car', 'Luxury Car',
       'Luxury SUV', 'SUV', 'Sports Car', 'Two-Door Car', 'Large', 'Medsize',
       'Small', 'Male'],
      dtype='object')

And finally it's time to make and join the predictions to the new: 50,000 observations dataset. I will see how well the algorithms perform by training on the 50,000 observations with new "ground truths", and testing on the real data.

In [13]:
# There probably is an automatic way to do this but there's only 4 models so it would be time wasting
good_models = [models[0], models[2], models[4], models[7]]
good_model_names = [model_names[0], model_names[2], model_names[4], model_names[7]]
# Print good model names:
print(good_model_names)
# Initialize the results
predictions = pd.DataFrame()
# Combine the predictions
for model in good_models:
    to_join = pd.DataFrame(data=model.predict(new_encoded_data))
    predictions = pd.concat([predictions, to_join], axis=1)
# Use model names as column labels
predictions.columns = good_model_names
['Decision Tree', 'Random Forest', 'Adaboost', 'Gradient Boosting Classifier']
In [14]:
# Add the average predictions column
predictions["Mean Prediction"] = np.round(predictions.mean(axis=1))
# Print the predictions
predictions
Out[14]:
Decision Tree Random Forest Adaboost Gradient Boosting Classifier Mean Prediction
0 1 1 1 1 1.0
1 1 1 1 1 1.0
2 3 3 3 3 3.0
3 4 4 4 4 4.0
4 2 2 2 2 2.0
5 4 3 3 3 3.0
6 3 3 3 3 3.0
7 2 1 1 2 2.0
8 4 4 3 4 4.0
9 2 1 2 2 2.0
10 3 4 3 3 3.0
11 1 1 1 1 1.0
12 2 2 2 2 2.0
13 4 4 3 4 4.0
14 1 1 1 1 1.0
15 1 1 1 1 1.0
16 1 1 1 1 1.0
17 3 4 3 3 3.0
18 3 3 3 3 3.0
19 1 1 1 1 1.0
20 4 1 3 4 3.0
21 4 3 3 4 4.0
22 2 2 3 2 2.0
23 1 1 1 1 1.0
24 3 3 3 3 3.0
25 4 3 3 3 3.0
26 3 4 3 4 4.0
27 4 4 4 4 4.0
28 2 2 2 2 2.0
29 2 2 2 2 2.0
... ... ... ... ... ...
49970 4 1 3 3 3.0
49971 3 3 3 3 3.0
49972 3 4 3 3 3.0
49973 4 2 3 4 3.0
49974 2 2 2 2 2.0
49975 4 4 3 3 4.0
49976 3 3 3 3 3.0
49977 1 2 1 1 1.0
49978 1 1 1 1 1.0
49979 2 2 2 2 2.0
49980 2 2 2 2 2.0
49981 3 3 3 3 3.0
49982 2 3 2 2 2.0
49983 3 3 3 3 3.0
49984 3 4 3 3 3.0
49985 1 1 1 1 1.0
49986 4 4 4 4 4.0
49987 3 3 3 3 3.0
49988 4 4 4 4 4.0
49989 3 3 3 3 3.0
49990 4 4 3 4 4.0
49991 1 1 1 1 1.0
49992 4 3 3 3 3.0
49993 4 4 4 4 4.0
49994 4 4 4 4 4.0
49995 1 3 1 1 2.0
49996 2 2 2 2 2.0
49997 3 3 3 3 3.0
49998 1 1 1 1 1.0
49999 2 2 2 2 2.0

50000 rows × 5 columns

It's clear that in most cases all the models agree on a label, which is a good sign. The average prediction should be a good metric for the labels.

In [15]:
new_final_data = pd.concat([new_data, predictions["Mean Prediction"]],axis=1)
new_final_data.rename(columns={"Mean Prediction": "Claim Class"}, inplace = True)
new_final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_all_data_correlated_socioeconomic.csv")

Validation:

First, start with how well does each of the models trained on the original data performs on the 50000 samples just created.

In [16]:
for model in good_models:
    print(model)
    print("Model Score:", model.score(new_encoded_data, predictions["Mean Prediction"])*100)
    print("-"*50)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
Model Score: 93.742
--------------------------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Model Score: 71.49
--------------------------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=0)
Model Score: 84.638
--------------------------------------------------
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=True,
              warm_start=False)
Model Score: 95.304
--------------------------------------------------

And now in reverse: train the models on the 50,000 samples and test on the original dataset.

In [17]:
#Fitting and scoring
for model in good_models:
    model.fit(new_encoded_data, predictions["Mean Prediction"])
    print(model)
    print("Model Train Score:", model.score(new_encoded_data, predictions["Mean Prediction"])*100)
    print("Model Test Score:", model.score(all_features, all_targets)*100)
    print("-"*50)
    print("Model Done")
    print("-*"*25)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
Model Train Score: 97.06
Model Test Score: 92.2049485439019
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Model Train Score: 97.832
Model Test Score: 90.86927961462668
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=0)
Model Train Score: 49.88
Model Test Score: 48.48916137508211
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
      Iter       Train Loss   Remaining Time 
         1       59699.6153           41.38s
         2       52527.3740           46.72s
         3       46878.7573           46.40s
         4       42074.7970           48.32s
         5       38103.5642           48.30s
         6       34760.5541           48.52s
         7       31918.7093           48.73s
         8       29469.5169           48.78s
         9       27358.1924           48.72s
        10       25499.8678           47.97s
        20       14059.6603           43.44s
        30        9832.9384           37.52s
        40        8186.5207           31.21s
        50        7203.5426           25.30s
        60        6561.6444           19.80s
        70        6031.2310           14.77s
        80        5646.9979           10.03s
        90        5318.9144            4.95s
       100        5015.6444            0.00s
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=True,
              warm_start=False)
Model Train Score: 96.744
Model Test Score: 91.87650536457193
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

Since all algorithms seem to have pretty good performance evenly across the switch all that is left is to split the data for the competition. More validation could be done if this data was used in a more meaningful way, but this is just a simple classification challenge.

In [24]:
competition_data, competition_test = train_test_split(new_final_data, test_size=0.1)
competition_data.reset_index(drop=True, inplace=True)
competition_test.reset_index(drop=True, inplace=True)
competition_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_student_data_corr.csv")
competition_test.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_test_data_corr.csv")