Data Creation Process for ASNA 2019 Case Competition Technical Challenge¶

The original dataset was found online; its source is no longer active. It was later perturbed so that, even if the dataset were still online, no useful data could be gathered from it for the purposes of winning the technical challenge.

Please note that this is a rough version of the notebook, written on short notice. I will post a cleaned-up version when I have a moment.¶

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy import stats
#Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

np.random.seed(1)
%matplotlib inline

# Read the raw dataset from the author's local copy
initial_data_path = "/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/initial_data.csv"
initial_data = pd.read_csv(initial_data_path)
# Preview the top rows to sanity-check the load
initial_data.head()

# Drop the Country, State name, Response, and Total Claim Amount Columns
initial_data = initial_data.drop(axis =1, labels=["Country", "State", "Response", "Total Claim Amount"])
# Perturb the dataset slightly so that it can't be searched on the internet easily and the results won't match.
# `size` is required: without it np.random.normal returns a single scalar, which
# shifts every row by the same constant instead of adding independent noise to
# each claim amount.
initial_data["Claim Amount"] += np.random.normal(loc=0, scale=np.sqrt(5), size=len(initial_data))
# Claim values looked low so multiply them by 5 (doesn't change anything for the challenge)
initial_data["Claim Amount"] *= 5

# Summarise the claim amounts; the quartiles computed here are the cut-offs
# used to build the claim severity classes
claim_amounts = initial_data["Claim Amount"]
print(claim_amounts.describe())
q25, q75 = (claim_amounts.quantile(level) for level in (0.25, 0.75))
q50 = claim_amounts.median()

# Claim classes based on the quantiles of the original (this is significantly faster than computing the quantiles at each iteration)
def create_claim_class(row, thresholds=None):
    """Return the claim severity class of a single record.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``"Claim Amount"`` (for example a DataFrame
        row passed in by ``DataFrame.apply(..., axis=1)``).
    thresholds : sequence of numbers, optional
        Ascending cut-offs between consecutive classes. When omitted, the
        module-level quartiles ``(q25, q50, q75)`` are used, which is the
        original behaviour.

    Returns
    -------
    int
        ``1`` when the amount is below the first cut-off, ``2`` when it is
        below the second, and so on; ``len(thresholds) + 1`` when it is not
        below any cut-off (``4`` with the default quartiles).
    """
    if thresholds is None:
        # Looked up at call time so the quartiles only need to exist once
        # the function is actually used.
        thresholds = (q25, q50, q75)
    amount = row["Claim Amount"]
    # Strict "<" at every boundary: a value equal to a cut-off belongs to
    # the next class up.
    for claim_class, upper_bound in enumerate(thresholds, start=1):
        if amount < upper_bound:
            return claim_class
    return len(thresholds) + 1

count     9134.000000
mean      4020.630971
std       3435.483804
min        967.164571
25%       2015.286630
50%       2908.251832
75%       4499.244254
max      41680.851328
Name: Claim Amount, dtype: float64

# Label every record with its severity class, then inspect the class balance
initial_data["Claim Class"] = initial_data.apply(create_claim_class, axis=1)
initial_data["Claim Class"].describe()
initial_data["Claim Class"].value_counts()

3    2288
4    2284
1    2284
2    2278
Name: Claim Class, dtype: int64

# Define the one-hot encoder
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Returns a tuple ``(encoded, added_columns)``: the encoded frame and the
    list of column names that were not present in ``df``. ``nan_as_category``
    is forwarded to ``pd.get_dummies`` as ``dummy_na``.
    """
    columns_before = set(df.columns)
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    # Anything that was not an input column was created by the encoding
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

# NOTE(review): `final_data` is reassigned from scratch by the LabelBinarizer
# encoding that follows, and `cat_cols` is not referenced anywhere else in the
# visible notebook, so this result appears to be unused.
final_data, cat_cols = one_hot_encoder(initial_data, nan_as_category=False)

# A single binarizer instance, re-fitted for every string column below
lb = preprocessing.LabelBinarizer()

# Gender is encoded on its own at the end because it has just 2 classes.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# TODO: Automate this
list_string_vars = ("State Code", "Coverage", "Education", "EmploymentStatus", "Location Code", "Marital Status", "Claim Reason", "Sales Channel", "Vehicle Class", "Vehicle Size")

# Start from the numeric columns, holding "Claim Class" aside so that it can
# be appended as the very last column.
numeric_part = initial_data.select_dtypes(include=numerics).drop(columns=["Claim Class"])
encoded_parts = [numeric_part]
for var in list_string_vars:
    indicator_matrix = lb.fit_transform(initial_data[var])
    # One indicator column per class, named after the class value itself
    encoded_parts.append(pd.DataFrame(data=indicator_matrix, columns=lb.classes_))
df_gender = pd.DataFrame(data=lb.fit_transform(initial_data["Gender"]),
                        columns=["Male"])

# Glue everything together side by side in one go
final_data = pd.concat(encoded_parts + [df_gender, initial_data["Claim Class"]], axis=1)
print(final_data.columns)

#Save the final Preprocessed Data
#final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/preprocessed_data.csv")

Index(['Customer', 'Claim Amount', 'Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'IA', 'KS', 'MO',
       'NE', 'OK', 'Basic', 'Extended', 'Premium', 'Bachelor', 'College',
       'Doctor', 'High School or Below', 'Master', 'Disabled', 'Employed',
       'Medical Leave', 'Retired', 'Unemployed', 'Rural', 'Suburban', 'Urban',
       'Divorced', 'Married', 'Single', 'Collision', 'Hail', 'Other',
       'Scratch/Dent', 'Agent', 'Branch', 'Call Center', 'Web',
       'Four-Door Car', 'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car',
       'Two-Door Car', 'Large', 'Medsize', 'Small', 'Male', 'Claim Class'],
      dtype='object')

Now I will create and train a few models that will be predicting the values of "Claim Class" column using all the others¶

Later I will be using those models to create "true" values for the enhanced dataset.¶

Currently models:¶

Decision Tree
SVM
Random Forest
Adaboost
Gaussian Naive Bayes
Multilayer Perceptron
Quadratic Discriminant Analysis
Gradient Boosting Classifier

# Features are every encoded column except "Customer" and the raw
# "Claim Amount" (the claim class is derived from it); the class itself is
# split off as the target.
all_features = final_data.drop(columns=["Customer", "Claim Amount"])
all_targets = all_features.pop("Claim Class")
# Hold out 20% of the rows for testing
(train_features,
 test_features,
 train_targets,
 test_targets) = train_test_split(all_features, all_targets, test_size=0.2)

# Keep each display name next to its estimator so the two lists stay aligned
named_models = [
    ("Decision Tree", DecisionTreeClassifier(max_depth=8)),
    # SVC is overfitting like crazy, needs regularization
    ("SVC", SVC(C=1, decision_function_shape="ovo", verbose=True)),
    ("Random Forest", RandomForestClassifier(max_depth=12, n_estimators=100)),
    ("Neural Net", MLPClassifier(hidden_layer_sizes=(100,100),max_iter=1000,alpha=0.001,tol = 0.000000001, solver="lbfgs", verbose=True, activation = "tanh")),
    ("Adaboost", AdaBoostClassifier(n_estimators=200, learning_rate=0.05)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis(reg_param=0)),
    ("Gradient Boosting Classifier", GradientBoostingClassifier(verbose=True)),
]
# Parallel lists: position i of one corresponds to position i of the other
model_names = [label for label, _ in named_models]
models = [estimator for _, estimator in named_models]
# Print the data size
print("Training Data size: {}".format(train_features.shape))

tilde_rule = "~"*50
star_rule = "*"*50

# Fit and score every model in one loop rather than one notebook cell each
for name, model in zip(model_names, models):
    model.random_state = 0
    print("Training Model :  {}".format(name))
    model.fit(train_features, train_targets)
    print("Done Training {}".format(name))
    # model.score(...) scaled to a percentage: held-out rows first, then the training rows
    test_score = model.score(test_features, test_targets) * 100
    train_score = model.score(train_features, train_targets) * 100
    print("{} Train Score : {}".format(name, train_score))
    print("{} Test Score : {}".format(name, test_score))
    print(tilde_rule, "Model Done", tilde_rule, sep="\n")

print(star_rule, tilde_rule, "ALL MODELS DONE", tilde_rule, star_rule, sep="\n")

Training Data size: (7307, 48)
Training Model :  Decision Tree
Done Training Decision Tree
Decision Tree Train Score : 94.9637334063227
Decision Tree Test Score : 91.40667761357416
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  SVC
[LibSVM]Done Training SVC
SVC Train Score : 97.15341453400849
SVC Test Score : 42.8024083196497
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Random Forest
Done Training Random Forest
Random Forest Train Score : 97.94717394279459
Random Forest Test Score : 92.77504105090311
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Neural Net
Done Training Neural Net
Neural Net Train Score : 42.38401532776789
Neural Net Test Score : 41.43404488232074
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Adaboost
Done Training Adaboost
Adaboost Train Score : 80.84029013274942
Adaboost Test Score : 81.33552271483306
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Gaussian Naive Bayes
Done Training Gaussian Naive Bayes
Gaussian Naive Bayes Train Score : 63.227042561926915
Gaussian Naive Bayes Test Score : 61.63108921729611
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  QDA
Done Training QDA
QDA Train Score : 50.82797317640618
QDA Test Score : 51.450465243568686
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training Model :  Gradient Boosting Classifier
      Iter       Train Loss   Remaining Time 
         1        8685.3634            8.05s
         2        7597.4283            8.35s
         3        6739.1580            8.10s
         4        5940.5194            8.04s
         5        5289.7698            8.01s
         6        4759.1374            8.06s
         7        4310.9545            8.11s
         8        3969.5428            7.88s
         9        3645.7359            7.69s
        10        3394.6779            7.54s
        20        2062.2822            6.24s
        30        1558.1028            5.43s
        40        1314.1766            4.54s
        50        1169.6003            3.71s
        60        1076.2102            2.91s
        70        1005.6195            2.16s
        80         950.2442            1.42s
        90         901.5966            0.70s
       100         847.4721            0.00s
Done Training Gradient Boosting Classifier
Gradient Boosting Classifier Train Score : 96.16805802654989
Gradient Boosting Classifier Test Score : 93.10344827586206
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model Done
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**************************************************
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ALL MODELS DONE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**************************************************

Now that we have our models pre-trained, it's time to generate some fake profiles. To account for correlations in socioeconomic data, I will split the columns into socioeconomic and non-socioeconomic ones, and sample the socioeconomic columns together. This makes the challenge a bit easier, since the socioeconomic data will almost necessarily be closer between the train and test sets, but inducing a correlation structure is not feasible in a short amount of time. I also split out the car-specific variables to avoid small-sized SUVs and other problematic data points.

# Column groups: socioeconomic indicators and car-specific variables
socioeconomic_columns = ["Education", "EmploymentStatus", "Income", "Marital Status"]
car_columns = ["Vehicle Class", "Vehicle Size"]

socioeconomic_data = initial_data[socioeconomic_columns]
car_data = initial_data[car_columns]

# Everything that belongs to neither group
nonsocioeconomic_data = initial_data.drop(columns=socioeconomic_columns)
uncorr_data = nonsocioeconomic_data.drop(columns=car_columns)

# Number of synthetic rows to draw (always with replacement)
n_new_rows = 50000

# Generate new uncorrelated data: every remaining column is drawn on its own,
# independently of the other columns
sampled_columns = []
for column in uncorr_data.columns:
    sampled = uncorr_data[column].sample(n_new_rows, replace=True)
    sampled_columns.append(sampled.reset_index(drop=True))
new_data_uncorr = pd.concat(sampled_columns, axis=1)
# Socioeconomic columns are drawn together, as whole rows
new_data_socio = socioeconomic_data.sample(n_new_rows, replace=True).reset_index(drop=True)
# Same for the car columns
new_data_car = car_data.sample(n_new_rows, replace=True).reset_index(drop=True)
# Place the three parts side by side (all share a fresh 0..n-1 index)
new_data = pd.concat([new_data_socio, new_data_uncorr, new_data_car], axis=1)
new_data = new_data.drop(columns=["Customer", "Claim Amount", "Claim Class"])
new_data

Now encode the new data so that I can predict the Claim Class values and set them as the new true labels for the challenge.

# Take just the numerical columns, cause they don't need encoding
new_encoded_parts = [new_data.select_dtypes(include=numerics)]
# Make the encoding for categorical vars
for var in list_string_vars:
    # Fit on the ORIGINAL column and only transform the sampled one: the
    # trained models expect exactly the indicator columns seen at training
    # time, and a category that happens to be absent from the sample would
    # otherwise silently drop its column.
    lb.fit(initial_data[var])
    new_encoded_parts.append(pd.DataFrame(data=lb.transform(new_data[var]),
                                          columns=lb.classes_))
# Again gender has to be separated out since it has just 2 classes
lb.fit(initial_data["Gender"])
df_gender = pd.DataFrame(data=lb.transform(new_data["Gender"]),
                        columns=["Male"])
# Combine Data
new_encoded_data = pd.concat(new_encoded_parts + [df_gender], axis=1)
# Print columns
print(new_encoded_data.columns)

Index(['Income', 'Monthly Premium Auto', 'Months Since Last Claim',
       'Months Since Policy Inception', 'Number of Open Complaints',
       'Number of Policies', 'IA', 'KS', 'MO', 'NE', 'OK', 'Basic', 'Extended',
       'Premium', 'Bachelor', 'College', 'Doctor', 'High School or Below',
       'Master', 'Disabled', 'Employed', 'Medical Leave', 'Retired',
       'Unemployed', 'Rural', 'Suburban', 'Urban', 'Divorced', 'Married',
       'Single', 'Collision', 'Hail', 'Other', 'Scratch/Dent', 'Agent',
       'Branch', 'Call Center', 'Web', 'Four-Door Car', 'Luxury Car',
       'Luxury SUV', 'SUV', 'Sports Car', 'Two-Door Car', 'Large', 'Medsize',
       'Small', 'Male'],
      dtype='object')

And finally it's time to make the predictions and join them to the new 50,000-observation dataset. I will see how well the algorithms perform by training on the 50,000 observations with the new "ground truths" and testing on the real data.

# Hand-picked positions (within `models`) of the estimators that are kept;
# there are only 4 of them, so selecting them automatically isn't worth it
good_model_indices = (0, 2, 4, 7)
good_models = [models[position] for position in good_model_indices]
good_model_names = [model_names[position] for position in good_model_indices]
# Print good model names:
print(good_model_names)
# One column of predicted classes per kept model, labelled with the model name
predictions = pd.DataFrame({
    label: estimator.predict(new_encoded_data)
    for label, estimator in zip(good_model_names, good_models)
})

['Decision Tree', 'Random Forest', 'Adaboost', 'Gradient Boosting Classifier']

# Add the average predictions column. Only the per-model columns are averaged,
# so re-running this cell does not fold an already existing "Mean Prediction"
# column into the new mean.
# NOTE: np.round rounds exact halves to the nearest even value
# (2.5 -> 2.0, 3.5 -> 4.0).
predictions["Mean Prediction"] = np.round(predictions[good_model_names].mean(axis=1))
# Print the predictions
predictions

It's clear that in most cases all the models agree on a label, which is a good sign. The average prediction should be a good metric for the labels.

# Attach the averaged prediction to the sampled profiles under the name
# "Claim Class" and write the complete synthetic dataset to disk
claim_class_labels = predictions["Mean Prediction"].rename("Claim Class")
new_final_data = pd.concat([new_data, claim_class_labels], axis=1)
new_final_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_all_data_correlated_socioeconomic.csv")

Validation:¶

First, start with how well each of the models trained on the original data performs on the 50,000 samples just created.

# How often each individual model matches the averaged label, as a percentage
dash_rule = "-"*50
for model in good_models:
    print(model)
    agreement = model.score(new_encoded_data, predictions["Mean Prediction"])*100
    print("Model Score:", agreement)
    print(dash_rule)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
Model Score: 93.742
--------------------------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Model Score: 71.49
--------------------------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=0)
Model Score: 84.638
--------------------------------------------------
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=True,
              warm_start=False)
Model Score: 95.304
--------------------------------------------------

And now in reverse: train the models on the 50,000 samples and test on the original dataset.

from sklearn.base import clone

#Fitting and scoring
for model in good_models:
    # Fit an unfitted copy with the same parameters. Re-fitting `model` itself
    # would overwrite the very estimators that generated the labels, so any
    # earlier cell re-run afterwards would silently use the wrong fit.
    refit_model = clone(model)
    refit_model.fit(new_encoded_data, predictions["Mean Prediction"])
    print(refit_model)
    # Train score: the synthetic rows; test score: the real, original rows
    print("Model Train Score:", refit_model.score(new_encoded_data, predictions["Mean Prediction"])*100)
    print("Model Test Score:", refit_model.score(all_features, all_targets)*100)
    print("-"*50)
    print("Model Done")
    print("-*"*25)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
Model Train Score: 97.06
Model Test Score: 92.2049485439019
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
Model Train Score: 97.832
Model Test Score: 90.86927961462668
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=0)
Model Train Score: 49.88
Model Test Score: 48.48916137508211
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
      Iter       Train Loss   Remaining Time 
         1       59699.6153           41.38s
         2       52527.3740           46.72s
         3       46878.7573           46.40s
         4       42074.7970           48.32s
         5       38103.5642           48.30s
         6       34760.5541           48.52s
         7       31918.7093           48.73s
         8       29469.5169           48.78s
         9       27358.1924           48.72s
        10       25499.8678           47.97s
        20       14059.6603           43.44s
        30        9832.9384           37.52s
        40        8186.5207           31.21s
        50        7203.5426           25.30s
        60        6561.6444           19.80s
        70        6031.2310           14.77s
        80        5646.9979           10.03s
        90        5318.9144            4.95s
       100        5015.6444            0.00s
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=True,
              warm_start=False)
Model Train Score: 96.744
Model Test Score: 91.87650536457193
--------------------------------------------------
Model Done
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

Since all algorithms seem to have pretty good performance evenly across the switch all that is left is to split the data for the competition. More validation could be done if this data was used in a more meaningful way, but this is just a simple classification challenge.

# Hold back 10% of the synthetic rows as the competition test set; both parts
# get a fresh 0..n-1 index before being written out
competition_data, competition_test = (
    part.reset_index(drop=True)
    for part in train_test_split(new_final_data, test_size=0.1)
)
competition_data.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_student_data_corr.csv")
competition_test.to_csv("/Users/michalmalyska/Desktop/Extracurriculars/ASNA2019/Case Comp/Final Data/competition_test_data_corr.csv")

	Decision Tree	Random Forest	Adaboost	Gradient Boosting Classifier	Mean Prediction
0	1	1	1	1	1.0
1	1	1	1	1	1.0
2	3	3	3	3	3.0
3	4	4	4	4	4.0
4	2	2	2	2	2.0
5	4	3	3	3	3.0
6	3	3	3	3	3.0
7	2	1	1	2	2.0
8	4	4	3	4	4.0
9	2	1	2	2	2.0
10	3	4	3	3	3.0
11	1	1	1	1	1.0
12	2	2	2	2	2.0
13	4	4	3	4	4.0
14	1	1	1	1	1.0
15	1	1	1	1	1.0
16	1	1	1	1	1.0
17	3	4	3	3	3.0
18	3	3	3	3	3.0
19	1	1	1	1	1.0
20	4	1	3	4	3.0
21	4	3	3	4	4.0
22	2	2	3	2	2.0
23	1	1	1	1	1.0
24	3	3	3	3	3.0
25	4	3	3	3	3.0
26	3	4	3	4	4.0
27	4	4	4	4	4.0
28	2	2	2	2	2.0
29	2	2	2	2	2.0
...	...	...	...	...	...
49970	4	1	3	3	3.0
49971	3	3	3	3	3.0
49972	3	4	3	3	3.0
49973	4	2	3	4	3.0
49974	2	2	2	2	2.0
49975	4	4	3	3	4.0
49976	3	3	3	3	3.0
49977	1	2	1	1	1.0
49978	1	1	1	1	1.0
49979	2	2	2	2	2.0
49980	2	2	2	2	2.0
49981	3	3	3	3	3.0
49982	2	3	2	2	2.0
49983	3	3	3	3	3.0
49984	3	4	3	3	3.0
49985	1	1	1	1	1.0
49986	4	4	4	4	4.0
49987	3	3	3	3	3.0
49988	4	4	4	4	4.0
49989	3	3	3	3	3.0
49990	4	4	3	4	4.0
49991	1	1	1	1	1.0
49992	4	3	3	3	3.0
49993	4	4	4	4	4.0
49994	4	4	4	4	4.0
49995	1	3	1	1	2.0
49996	2	2	2	2	2.0
49997	3	3	3	3	3.0
49998	1	1	1	1	1.0
49999	2	2	2	2	2.0

	Customer	Country	State Code	State	Claim Amount	Response	Coverage	Education	EmploymentStatus	Gender	...	Monthly Premium Auto	Months Since Last Claim	Months Since Policy Inception	Number of Policies	Claim Reason	Sales Channel	Total Claim Amount	Vehicle Class	Vehicle Size
0	1	US	KS	Kansas	276.351928	No	Basic	Bachelor	Employed	F	...	69	32	5	1	Collision	Agent	384.811147	Two-Door Car	Medsize
1	2	US	NE	Nebraska	697.953590	No	Extended	Bachelor	Unemployed	F	...	94	13	42	8	Scratch/Dent	Agent	1131.464935	Four-Door Car	Medsize
2	3	US	OK	Oklahoma	1288.743165	No	Premium	Bachelor	Employed	F	...	108	18	38	2	Collision	Agent	566.472247	Two-Door Car	Medsize
3	4	US	MO	Missouri	764.586183	No	Basic	Bachelor	Unemployed	M	...	106	18	65	7	Collision	Call Center	529.881344	SUV	Medsize
4	5	US	KS	Kansas	281.369258	No	Basic	Bachelor	Employed	M	...	73	12	44	1	Collision	Agent	138.130879	Four-Door Car	Medsize

	Education	EmploymentStatus	Income	Marital Status	State Code	Coverage	Gender	Location Code	Monthly Premium Auto	Months Since Last Claim	Months Since Policy Inception	Number of Open Complaints	Number of Policies	Claim Reason	Sales Channel	Vehicle Class	Vehicle Size
0	Doctor	Employed	73853	Single	MO	Basic	M	Suburban	71	3	8	1	1	Other	Agent	SUV	Small
1	Doctor	Employed	66889	Married	KS	Basic	F	Suburban	83	3	4	0	1	Hail	Agent	Four-Door Car	Medsize
2	College	Employed	60462	Married	NE	Extended	M	Suburban	87	20	32	3	4	Hail	Agent	Four-Door Car	Medsize
3	Bachelor	Employed	97361	Single	NE	Basic	F	Suburban	266	4	93	0	2	Collision	Agent	Four-Door Car	Medsize
4	College	Unemployed	0	Single	NE	Basic	F	Rural	68	29	94	0	4	Hail	Agent	Four-Door Car	Medsize
5	Bachelor	Medical Leave	13037	Married	MO	Extended	M	Suburban	112	5	27	0	9	Hail	Call Center	Four-Door Car	Large
6	High School or Below	Employed	34510	Divorced	IA	Extended	M	Suburban	84	1	70	0	8	Hail	Branch	Four-Door Car	Small
7	High School or Below	Employed	51255	Married	IA	Basic	F	Urban	102	11	45	0	1	Collision	Agent	Two-Door Car	Medsize
8	Bachelor	Employed	30389	Married	MO	Extended	F	Rural	139	15	41	0	6	Hail	Web	Luxury SUV	Medsize
9	Bachelor	Employed	21947	Married	MO	Basic	M	Urban	133	2	30	0	1	Other	Branch	Four-Door Car	Medsize
10	Bachelor	Employed	49853	Single	NE	Extended	M	Suburban	68	31	44	4	2	Scratch/Dent	Call Center	SUV	Medsize
11	Bachelor	Unemployed	0	Married	NE	Premium	F	Suburban	77	3	47	0	1	Collision	Branch	SUV	Medsize
12	Bachelor	Employed	70432	Married	OK	Extended	F	Urban	134	2	79	0	1	Other	Call Center	Four-Door Car	Medsize
13	College	Employed	88082	Married	MO	Premium	M	Rural	136	13	36	0	8	Other	Branch	Four-Door Car	Medsize
14	High School or Below	Employed	22244	Married	NE	Extended	F	Suburban	67	31	25	0	1	Collision	Agent	SUV	Medsize
15	High School or Below	Employed	82320	Married	IA	Extended	M	Urban	64	23	60	1	1	Scratch/Dent	Agent	SUV	Medsize
16	Bachelor	Employed	24285	Divorced	KS	Basic	F	Suburban	64	9	30	1	1	Hail	Call Center	Four-Door Car	Medsize
17	College	Employed	67326	Married	NE	Extended	M	Suburban	72	25	27	0	2	Collision	Branch	SUV	Medsize
18	Bachelor	Unemployed	0	Single	NE	Basic	M	Rural	61	22	18	0	2	Hail	Branch	Two-Door Car	Medsize
19	Bachelor	Employed	68004	Single	IA	Basic	M	Suburban	62	2	8	0	1	Collision	Agent	Sports Car	Large
20	Bachelor	Employed	36088	Single	IA	Basic	F	Suburban	283	16	77	0	1	Collision	Branch	Four-Door Car	Medsize
21	College	Employed	20396	Married	IA	Extended	F	Suburban	138	33	37	0	3	Hail	Agent	Four-Door Car	Medsize
22	Bachelor	Employed	58781	Divorced	MO	Basic	M	Suburban	73	7	0	3	3	Hail	Branch	Four-Door Car	Medsize
23	Bachelor	Employed	92296	Married	NE	Extended	F	Suburban	63	24	97	0	1	Hail	Call Center	Luxury SUV	Medsize
24	High School or Below	Employed	50549	Divorced	IA	Basic	M	Suburban	74	35	48	0	9	Hail	Call Center	Four-Door Car	Medsize
25	High School or Below	Employed	48552	Married	NE	Basic	M	Urban	68	1	94	0	2	Scratch/Dent	Branch	Two-Door Car	Medsize
26	College	Employed	91541	Married	MO	Basic	F	Rural	63	32	10	1	2	Collision	Agent	Four-Door Car	Large
27	Bachelor	Employed	33190	Divorced	OK	Basic	M	Suburban	125	15	20	0	2	Collision	Agent	SUV	Large
28	College	Employed	52982	Divorced	OK	Extended	F	Suburban	119	34	49	0	1	Collision	Agent	Sports Car	Medsize
29	Bachelor	Unemployed	0	Single	IA	Basic	F	Suburban	68	6	76	0	9	Collision	Branch	Four-Door Car	Small
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
49970	High School or Below	Employed	54586	Married	NE	Extended	F	Suburban	239	27	34	0	1	Collision	Call Center	Four-Door Car	Medsize
49971	College	Employed	69161	Married	MO	Premium	F	Suburban	73	1	23	0	8	Hail	Web	Two-Door Car	Medsize
49972	College	Employed	37937	Married	NE	Extended	M	Suburban	95	3	73	0	8	Hail	Call Center	SUV	Medsize
49973	High School or Below	Medical Leave	17120	Married	MO	Basic	M	Urban	137	19	42	1	3	Hail	Branch	Four-Door Car	Medsize
49974	Master	Unemployed	0	Married	OK	Basic	F	Urban	69	20	69	0	8	Hail	Agent	Four-Door Car	Medsize
49975	Master	Employed	88997	Divorced	MO	Basic	M	Suburban	63	3	14	3	2	Hail	Agent	Two-Door Car	Medsize
49976	College	Employed	47199	Married	MO	Basic	M	Rural	96	16	91	0	7	Collision	Agent	Four-Door Car	Medsize
49977	High School or Below	Employed	41520	Married	NE	Basic	M	Urban	68	8	42	0	1	Hail	Branch	Sports Car	Medsize
49978	College	Employed	62396	Married	NE	Extended	M	Suburban	64	2	54	0	1	Collision	Web	Two-Door Car	Medsize
49979	Bachelor	Employed	28909	Married	IA	Extended	M	Urban	118	34	88	0	1	Other	Branch	SUV	Medsize
49980	Bachelor	Employed	39547	Married	OK	Basic	M	Urban	134	33	37	0	1	Hail	Agent	Four-Door Car	Small
49981	Bachelor	Employed	22697	Divorced	NE	Basic	F	Suburban	80	4	94	3	3	Collision	Branch	SUV	Medsize
49982	Bachelor	Employed	85896	Married	MO	Extended	F	Suburban	61	26	31	0	6	Collision	Agent	Four-Door Car	Medsize
49983	Bachelor	Unemployed	0	Married	NE	Extended	F	Rural	67	1	28	0	2	Collision	Branch	Four-Door Car	Large
49984	College	Unemployed	0	Single	MO	Basic	F	Suburban	62	23	24	4	2	Other	Agent	SUV	Medsize
49985	Master	Employed	80540	Married	MO	Extended	M	Suburban	73	1	35	0	1	Collision	Agent	Luxury Car	Medsize
49986	College	Employed	83846	Married	IA	Extended	F	Suburban	130	3	73	0	2	Scratch/Dent	Web	Four-Door Car	Medsize
49987	Bachelor	Disabled	16101	Divorced	IA	Basic	F	Suburban	115	11	72	0	4	Hail	Agent	Four-Door Car	Medsize
49988	High School or Below	Unemployed	0	Married	MO	Basic	F	Urban	110	16	3	0	2	Scratch/Dent	Agent	Four-Door Car	Large
49989	Master	Employed	74775	Divorced	NE	Extended	M	Suburban	74	29	38	0	3	Hail	Agent	Two-Door Car	Large
49990	High School or Below	Employed	24239	Married	MO	Extended	F	Rural	128	27	48	0	9	Collision	Web	Two-Door Car	Small
49991	High School or Below	Employed	86451	Single	IA	Basic	M	Rural	92	1	87	0	1	Other	Agent	Four-Door Car	Medsize
49992	College	Medical Leave	22981	Single	IA	Basic	M	Rural	69	4	22	0	2	Scratch/Dent	Call Center	Two-Door Car	Medsize
49993	High School or Below	Employed	55274	Married	MO	Basic	M	Suburban	102	3	15	0	2	Other	Agent	Four-Door Car	Medsize
49994	Bachelor	Employed	33402	Married	OK	Basic	F	Rural	121	8	93	0	2	Hail	Branch	Four-Door Car	Medsize
49995	High School or Below	Employed	37995	Married	MO	Premium	F	Urban	63	7	63	1	1	Hail	Agent	SUV	Small
49996	College	Employed	60108	Married	OK	Basic	M	Suburban	66	11	37	0	3	Collision	Agent	Four-Door Car	Large
49997	Master	Employed	68197	Married	MO	Basic	M	Suburban	96	29	25	0	4	Collision	Branch	Two-Door Car	Medsize
49998	Master	Medical Leave	12626	Single	IA	Basic	F	Suburban	82	6	18	0	1	Scratch/Dent	Agent	Four-Door Car	Medsize
49999	Doctor	Employed	40870	Married	MO	Basic	F	Suburban	61	35	84	0	3	Collision	Branch	Two-Door Car	Medsize