The original dataset was found online; its source is no longer active. It was later perturbed so that, even if the dataset were still online, no useful data could be gathered from it for the purposes of winning the technical challenge.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy import stats
#Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')
np.random.seed(1)
%matplotlib inline
# Import Data
initial_data = pd.read_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/initial_data.csv")
# See the first few rows
initial_data.head()
# Drop the Country, State name, Response, and Total Claim Amount columns
initial_data = initial_data.drop(columns=["Country", "State", "Response", "Total Claim Amount"])
# Perturb the dataset slightly so that it can't be searched on the internet easily and the results won't match.
# One noise value is drawn per row: without `size`, np.random.normal returns a
# single scalar, which would shift every claim by the same constant and leave
# the differences between rows untouched.
initial_data["Claim Amount"] += np.random.normal(loc=0, scale=np.sqrt(5), size=len(initial_data))
# Claim values looked low so multiply them by 5 (doesn't change anything for the challenge)
initial_data["Claim Amount"] *= 5
# See column values, to create claim severity classes
print(initial_data["Claim Amount"].describe())
# Quartile cut points of the perturbed, rescaled claim amounts; they are read
# as module-level values when the claim severity classes are assigned.
q25 = initial_data["Claim Amount"].quantile(0.25)
q50 = initial_data["Claim Amount"].median()
q75 = initial_data["Claim Amount"].quantile(0.75)
# Claim classes based on the quantiles of the original (this is significantly faster than computing the quantiles at each iteration)
def create_claim_class(row):
    """Return the claim severity class (1-4) for one row.

    The row's "Claim Amount" is compared against the module-level cut points
    ``q25``, ``q50`` and ``q75``, in that order. The class is the position of
    the first cut point the amount falls strictly below; an amount that is not
    below any of them gets class 4.
    """
    amount = row["Claim Amount"]
    for claim_class, upper_bound in enumerate((q25, q50, q75), start=1):
        if amount < upper_bound:
            return claim_class
    return 4
# Label every row with its claim severity class; the function is applied row-wise (axis=1)
initial_data["Claim Class"] = initial_data.apply(create_claim_class, axis=1)
# Notebook-style inspection of the new column (the results are not stored)
initial_data["Claim Class"].describe()
initial_data["Claim Class"].value_counts()
# Define the one-hot encoder
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is not modified.
    nan_as_category : bool
        Forwarded to ``pd.get_dummies`` as ``dummy_na``; when true, an extra
        indicator column for missing values is created per encoded column.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists the
        column names of the encoded frame that were not present in ``df``.
    """
    columns_before = set(df.columns)
    # Only columns stored with the generic object dtype are encoded
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns
# One-hot encode all object columns. NOTE: final_data is rebuilt from scratch
# further down, so only cat_cols from this call is still available afterwards.
final_data, cat_cols = one_hot_encoder(initial_data, nan_as_category=False)
# Binarizer used to encode the string columns one at a time
lb = preprocessing.LabelBinarizer()
# Numeric dtypes that are carried over without any encoding
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# TODO: Automate this
# Gender is not in this list: it has just 2 classes, so it is handled separately below.
list_string_vars = ("State Code", "Coverage", "Education", "EmploymentStatus", "Location Code", "Marital Status", "Claim Reason", "Sales Channel", "Vehicle Class", "Vehicle Size")
# Collect the pieces of the design matrix and concatenate them once at the end.
# First piece: the numeric columns, without the target.
encoded_frames = [initial_data.select_dtypes(include=numerics).drop(axis=1, labels=["Claim Class"])]
for var in list_string_vars:
    # One indicator column per class, named after the class label
    encoding = lb.fit_transform(initial_data[var])
    encoded_frames.append(pd.DataFrame(data=encoding, columns=lb.classes_))
# Gender is binarized into a single column
df_gender = pd.DataFrame(data=lb.fit_transform(initial_data["Gender"]), columns=["Male"])
encoded_frames.append(df_gender)
# The target goes last
encoded_frames.append(initial_data["Claim Class"])
final_data = pd.concat(encoded_frames, axis=1)
print(final_data.columns)
#Save the final Preprocessed Data
#final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/preprocessed_data.csv")
# Features: everything except the customer identifier, the raw claim amount
# (the classes are derived from it) and the target itself.
all_features = final_data.drop(axis=1, labels=["Customer", "Claim Amount"])
all_targets = all_features.pop("Claim Class")
# Hold out 20% of the rows for testing
train_features, test_features, train_targets, test_targets = train_test_split(all_features, all_targets, test_size=0.2)
# Every candidate classifier paired with its display name, so the two can never get out of step
named_models = [
    ("Decision Tree", DecisionTreeClassifier(max_depth=8)),
    # SVC is overfitting like crazy, needs regularization
    ("SVC", SVC(C=1, decision_function_shape="ovo", verbose=True)),
    ("Random Forest", RandomForestClassifier(max_depth=12, n_estimators=100)),
    ("Neural Net", MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000,alpha=0.001,tol = 0.000000001, solver="lbfgs", verbose=True, activation = "tanh")),
    ("Adaboost", AdaBoostClassifier(n_estimators=200, learning_rate=0.05)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis(reg_param=0)),
    ("Gradient Boosting Classifier", GradientBoostingClassifier(verbose=True)),
]
# Parallel lists: later code selects models and names by position
models = [model for _, model in named_models]
model_names = [name for name, _ in named_models]
# Print the data size
print("Training Data size: {}".format(train_features.shape))
# Separator lines used in the training log
minor_rule = "~"*50
major_rule = "*"*50
# Fit and score every model in one loop instead of having a separate cell per model
for name, model in named_models:
    # Same seed for every estimator (set as a plain attribute before fitting)
    model.random_state = 0
    print("Training Model : {}".format(name))
    model.fit(train_features, train_targets)
    print("Done Training {}".format(name))
    # Accuracy in percent on the held-out rows and on the training rows
    test_score = model.score(test_features, test_targets) * 100
    train_score = model.score(train_features, train_targets) * 100
    print("{} Train Score : {}".format(name, train_score))
    print("{} Test Score : {}".format(name, test_score))
    print(minor_rule, "Model Done", minor_rule, major_rule, sep="\n")
print(minor_rule, "ALL MODELS DONE", minor_rule, major_rule, sep="\n")
Now that we have our models pre-trained, it's time to generate some fake profiles. To account for correlations in the socioeconomic data, I will split the columns into socioeconomic and non-socioeconomic groups and sample the socioeconomic columns together, row by row. This makes the challenge a bit easier, since the socioeconomic data will almost necessarily be closer between the train and test sets, but inducing a correlation structure is not feasible in a short amount of time. I also split out the car-specific variables, which are likewise sampled together, to avoid small-sized SUVs and other problematic data points.
# Split out the socioeconomic indicators:
socioeconomic_columns = ["Education", "EmploymentStatus", "Income", "Marital Status"]
socioeconomic_data = initial_data[socioeconomic_columns]
# Split out the car-specific columns:
car_columns = ["Vehicle Class", "Vehicle Size"]
car_data = initial_data[car_columns]
# Everything else is treated as uncorrelated
nonsocioeconomic_data = initial_data.drop(socioeconomic_columns, axis = 1)
uncorr_data = nonsocioeconomic_data.drop(car_columns, axis = 1)
# Number of synthetic profiles to generate
n_synthetic_rows = 50000
# Generate new uncorrelated data: every column is resampled (with replacement)
# independently of the others, so values in one synthetic row come from
# different original rows. The indices are reset so the pieces line up
# positionally when they are concatenated.
new_data_uncorr = pd.concat(
    [
        uncorr_data[column].sample(n_synthetic_rows, replace=True).reset_index(drop=True)
        for column in uncorr_data.columns
    ],
    axis=1,
)
# Sample the socioeconomic data as whole rows, keeping its columns together
new_data_socio = socioeconomic_data.sample(n_synthetic_rows, replace=True).reset_index(drop=True)
# Sample the car data as whole rows, keeping class and size together
new_data_car = car_data.sample(n_synthetic_rows, replace=True).reset_index(drop=True)
# Combine the data
new_data = pd.concat([new_data_socio, new_data_uncorr, new_data_car], axis=1)
# The identifier, the raw claim amount and the resampled class are not part of the synthetic profiles
new_data = new_data.drop(axis=1, labels=["Customer", "Claim Amount", "Claim Class"])
new_data
Now encode the new data so that I can predict the Claim Class values and set them as the new true labels for the challenge.
# Take just the numerical columns, because they don't need encoding. They are
# listed in the column order of the original data (the order the models were
# trained on); selecting them straight from new_data would put the
# socioeconomic numerics first and misalign the features.
numeric_columns = [
    col
    for col in initial_data.select_dtypes(include=numerics).columns
    if col in new_data.columns
]
encoded_frames = [new_data[numeric_columns]]
# Make the encoding for categorical vars
for var in list_string_vars:
    # Fit on the original column so the indicator columns match the training
    # data exactly, even if a rare class was never drawn into the sample.
    lb.fit(initial_data[var])
    encoding = lb.transform(new_data[var])
    # One indicator column per class, named after the class label
    encoded_frames.append(pd.DataFrame(data=encoding, columns=lb.classes_))
# Again gender has to be separated out since it has just 2 classes
df_gender = pd.DataFrame(data=lb.fit(initial_data["Gender"]).transform(new_data["Gender"]),
                         columns=["Male"])
encoded_frames.append(df_gender)
# Combine Data
new_encoded_data = pd.concat(encoded_frames, axis=1)
# Fail loudly on any remaining mismatch: the models would otherwise be fed
# features in the wrong positions.
if list(new_encoded_data.columns) != list(all_features.columns):
    raise ValueError("Encoded synthetic columns do not match the training feature columns")
# Print columns
print(new_encoded_data.columns)
And finally, it's time to make the predictions and join them to the new 50,000-observation dataset. I will see how well the algorithms perform by training on the 50,000 observations with the new "ground truths" and testing on the real data.
# There probably is an automatic way to do this but there's only 4 models so it would be time wasting.
# Hand-picked positions in `models` / `model_names`:
# decision tree, random forest, AdaBoost and gradient boosting.
good_model_indices = (0, 2, 4, 7)
good_models = [models[position] for position in good_model_indices]
good_model_names = [model_names[position] for position in good_model_indices]
# Print good model names:
print(good_model_names)
# One column of predicted classes per selected model, labelled with the model's name
predictions = pd.concat(
    [
        pd.Series(model.predict(new_encoded_data), name=name)
        for name, model in zip(good_model_names, good_models)
    ],
    axis=1,
)
# Add the average predictions column: the row mean over the models, rounded to
# a whole class (np.round sends exact .5 ties to the nearest even value).
predictions["Mean Prediction"] = np.round(predictions.mean(axis=1))
# Print the predictions
predictions
It's clear that in most cases all the models agree on a label, which is a good sign. The average prediction should be a good metric for the labels.
# Attach the rounded mean prediction to the synthetic profiles as their "Claim Class" label
claim_class_labels = predictions["Mean Prediction"].rename("Claim Class")
new_final_data = pd.concat([new_data, claim_class_labels], axis=1)
# Save the full labelled synthetic dataset
new_final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_all_data_correlated_socioeconomic.csv")
First, check how well each of the models trained on the original data performs on the 50,000 samples just created.
# Score each selected model against the averaged labels on the synthetic data
synthetic_labels = predictions["Mean Prediction"]
for model in good_models:
    print(model)
    accuracy = model.score(new_encoded_data, synthetic_labels)
    print("Model Score:", accuracy*100)
    print("-"*50)
And now in reverse: train the models on the 50,000 samples and test on the original dataset.
# Fitting and scoring in reverse: refit each selected model on the synthetic
# data and its averaged labels, then evaluate on the original data.
# NOTE: the estimators are refit in place, so the fits from the original
# training loop are replaced.
synthetic_labels = predictions["Mean Prediction"]
for model in good_models:
    model.fit(new_encoded_data, synthetic_labels)
    print(model)
    train_accuracy = model.score(new_encoded_data, synthetic_labels)
    print("Model Train Score:", train_accuracy*100)
    test_accuracy = model.score(all_features, all_targets)
    print("Model Test Score:", test_accuracy*100)
    print("-"*50)
    print("Model Done")
    print("-*"*25)
Since all algorithms seem to perform comparably well in both directions of the switch, all that is left is to split the data for the competition. More validation could be done if this data were used in a more meaningful way, but this is just a simple classification challenge.
# Hold out 10% of the labelled synthetic data as the competition test set
competition_split = train_test_split(new_final_data, test_size=0.1)
# Renumber both parts from zero so the shuffled original positions are not kept
competition_data = competition_split[0].reset_index(drop=True)
competition_test = competition_split[1].reset_index(drop=True)
# Write the student (training) file and the held-out test file
competition_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_student_data_corr.csv")
competition_test.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_test_data_corr.csv")