Implementing classifiers with PyTorch¶

In this example, we see how to build a neural network for binary classification using PyTorch.

We'll use scikit-learn for some utilities: vectorizing and scaling the features, splitting off a validation set, and evaluation.

In [1]:
import torch

Some imports for plotting.

In [2]:
import matplotlib.pyplot as plt
import numpy as np

#%config InlineBackend.figure_format = 'svg' 
plt.style.use('bmh')

Preprocessing¶

We first read the usual Adult dataset, and then apply standard scikit-learn functions to convert the data into a matrix.

In [3]:
import pandas as pd

train_data = pd.read_csv('data/adult_train.csv')

n_cols = len(train_data.columns)
Xtrain = train_data.iloc[:, :n_cols-1].to_dict('records')
Ytrain = train_data.iloc[:, n_cols-1]

test_data = pd.read_csv('data/adult_test.csv')
Xtest = test_data.iloc[:, :n_cols-1].to_dict('records')
Ytest = test_data.iloc[:, n_cols-1]
In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# A Pipeline combining the vectorizer and scaler.
pipeline = make_pipeline(
    DictVectorizer(sparse=False),
    StandardScaler(),
)

# Convert the Adult dataset into a matrix.
Xv = pipeline.fit_transform(Xtrain)

# Apply the same transformation to the test set.
Xtest_v = pipeline.transform(Xtest)
In [5]:
Xv.shape, Xtest_v.shape
Out[5]:
((32561, 107), (16281, 107))
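
The 107 columns are the result of the DictVectorizer one-hot encoding the categorical features. If you're curious which column corresponds to which feature, something like the following sketch should work in a reasonably recent scikit-learn (in older versions, the method is called get_feature_names instead of get_feature_names_out):

# Look up the vectorizer inside the pipeline and list a few column names.
# Categorical values get columns such as 'workclass=Private' (an example name;
# the exact names depend on the dataset's column headers).
vectorizer = pipeline.named_steps['dictvectorizer']
vectorizer.get_feature_names_out()[:10]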

When training a binary classifier in PyTorch with a binary cross-entropy loss, the class labels need to be coded as 0/1 or False/True. The code below will convert Ytrain and Ytest into Boolean values.

In [6]:
Yv = np.array(Ytrain) == '>50K'
Ytest_v = np.array(Ytest) == '>50K'
In [7]:
Yv.shape
Out[7]:
(32561,)
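
As a quick sanity check, the mean of a Boolean array gives the fraction of positive (>50K) examples, so we can look at the class balance like this:

# Fraction of positive examples in the training and test labels.
Yv.mean(), Ytest_v.mean()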

Training a model with PyTorch¶

We'll first import some parts of PyTorch:

In [8]:
import torch
from torch import nn

from torch.utils.data import DataLoader

Now, let's see how the training process works in PyTorch.

The first thing to note is that the model is not declared here, but will be provided as an input to this training function. The idea is that it will be an initialized model that has not yet been trained on the data.

Please look at the comments inside the code for more details. The crucial building blocks used here are the following:

  • Optimizers such as SGD or Adam
  • Loss functions such as BCEWithLogitsLoss (illustrated in the short sketch after this list)
  • DataLoader to help us create minibatches
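
To illustrate the point about BCEWithLogitsLoss: it applies the sigmoid internally (in a numerically stable way), so it expects raw scores (logits) rather than probabilities. The small sketch below, on made-up numbers, checks that it agrees with applying a sigmoid followed by BCELoss:

logits = torch.tensor([0.5, -1.2, 2.0])
targets = torch.tensor([1.0, 0.0, 1.0])

# BCEWithLogitsLoss takes raw logits...
loss_with_logits = nn.BCEWithLogitsLoss()(logits, targets)
# ...while BCELoss expects probabilities, so we apply the sigmoid ourselves.
loss_manual = nn.BCELoss()(torch.sigmoid(logits), targets)

loss_with_logits, loss_manual   # the two values should be (almost) identical
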
In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_classifier(model, X, Y):
    # Some hyperparameters to control the training process.
    lr = 1e-2          # Learning rate for SGD.
    n_epochs = 50      # Number of epochs (complete passes over the training set)
    batch_size = 100   # Size of each minibatch (subset of training data)
    val_size = 0.2     # Proportion of the dataset that will be set aside for validation.
    
    # Train/validation split using the sklearn utility function.
    Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, random_state=0, test_size=val_size)
       
    # The optimizer defines how the model is updated. We first use
    # plain stochastic gradient descent, and can then switch to Adam.
    # Note that it has to be told which parameters to update.
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)    
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)    
    
    # The loss function is binary cross-entropy (log-loss).
    # NOTE: we use BCEWithLogitsLoss since the model outputs raw scores
    # (logits): no sigmoid is applied in the output layer.
    loss_func = torch.nn.BCEWithLogitsLoss()
   
    # A DataLoader creates minibatches for the training iterations.
    # We set shuffle=True so that we process instances in a random order.
    loader = DataLoader(list(zip(Xtrain, Ytrain)), 
                        shuffle=True, batch_size=batch_size)

    # We log accuracies for plotting later.
    acc_history = []
    
    for epoch in range(n_epochs):

        # We set the model in "training mode". This only affects
        # components such as dropout that have a different behavior
        # during training. Otherwise, it has no effect.
        model.train()
        
        loss_sum = 0
        # In each iteration, go through all the batches in the training set.
        for Xbatch, Ybatch in loader:
                        
            # PyTorch models by default work with 32-bit floating-point
            # numbers. We convert the tensors into 32-bit floats.
            Xbatch = Xbatch.type(torch.FloatTensor)
            Ybatch = Ybatch.type(torch.FloatTensor)

            # Forward pass: compute the model outputs over the whole batch.
            outputs = model(Xbatch)
            
            # Compute the loss function over the whole batch.
            # The .view(-1) is a small annoyance: it flattens the model's
            # 2-dimensional output into a 1-dimensional tensor, because
            # Ybatch is 1-dimensional.
            loss = loss_func(outputs.view(-1), Ybatch)
            
            # Get rid of previously computed gradients.
            optimizer.zero_grad()
            # Backward pass: compute the gradients with respect to the
            # model's parameters.
            loss.backward()
            # Update the model.
            optimizer.step()
            
            # Aggregate the loss for statistics.
            loss_sum += loss.item()
               
        # Set the model in "evaluation mode", turning off dropout if present.
        model.eval()
        # We don't want to update the model now, so we can turn off gradient
        # computation (torch.no_grad()) for a small gain in efficiency.
        with torch.no_grad():
            train_acc = predict_and_evaluate(model, Xtrain, Ytrain)
            val_acc = predict_and_evaluate(model, Xval, Yval)
                
        mean_loss = loss_sum / len(loader)

        acc_history.append((train_acc, val_acc))
        
        if (epoch+1) % 5 == 0:
            print(f'Epoch {epoch+1}: loss = {mean_loss:.4f}, train acc = {train_acc:.4f}, val acc = {val_acc:.4f}')
    
    return acc_history
        
    
# A utility function to compute accuracies during training.
def predict_and_evaluate(model, X, Y):
    # Let's assume the dataset is small enough! Otherwise, use a dataloader.
    Xt = torch.tensor(X).type(torch.FloatTensor)
    
    scores = model(Xt).view(-1)
    guesses = (scores > 0).numpy()
    return accuracy_score(Y, guesses)
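
Note that predict_and_evaluate thresholds the raw scores at 0: since the model outputs logits, a score above 0 corresponds to a sigmoid probability above 0.5. If you want actual probabilities instead of hard guesses, a small sketch (assuming a trained model and a float tensor Xt as in the function above):

# Convert the logits into probabilities of the positive class (>50K).
with torch.no_grad():
    probabilities = torch.sigmoid(model(Xt).view(-1))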

We can now declare a model and call the training function. (This code declares a simple linear model and a more complex feedforward model using Sequential, but only trains the latter.)

Feel free to play around with different models here, and try adding dropout if you run into problems with overfitting.
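
For instance, a dropout variant of the feedforward model could look like the sketch below (the name mlp_dropout_model and the dropout probability 0.5 are just illustrative, and n_input_features is defined in the next cell):

mlp_dropout_model = nn.Sequential(
    nn.Linear(in_features=n_input_features, out_features=256),
    nn.ReLU(),
    # Randomly zero out hidden activations during training to reduce overfitting.
    nn.Dropout(p=0.5),
    nn.Linear(in_features=256, out_features=1)
)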

In [10]:
torch.random.manual_seed(0)
n_input_features = Xv.shape[1]

linear_model = nn.Linear(in_features=n_input_features, out_features=1)

mlp_model = nn.Sequential(
    nn.Linear(in_features=n_input_features, out_features=256),
    nn.ReLU(),
    nn.Linear(in_features=256, out_features=1)
)

history = train_classifier(mlp_model, Xv, Yv)
Epoch 5: loss = 0.3427, train acc = 0.8427, val acc = 0.8394
Epoch 10: loss = 0.3275, train acc = 0.8480, val acc = 0.8429
Epoch 15: loss = 0.3209, train acc = 0.8512, val acc = 0.8463
Epoch 20: loss = 0.3158, train acc = 0.8539, val acc = 0.8478
Epoch 25: loss = 0.3117, train acc = 0.8553, val acc = 0.8488
Epoch 30: loss = 0.3083, train acc = 0.8571, val acc = 0.8489
Epoch 35: loss = 0.3055, train acc = 0.8575, val acc = 0.8494
Epoch 40: loss = 0.3032, train acc = 0.8590, val acc = 0.8500
Epoch 45: loss = 0.3008, train acc = 0.8598, val acc = 0.8508
Epoch 50: loss = 0.2989, train acc = 0.8613, val acc = 0.8506

We plot the accuracies on the training set and the validation set over the epochs.

Depending on your model and how you carry out the training, you may see overfitting here, and the validation accuracy may start to drop.

In [11]:
plt.plot([a[0] for a in history])
plt.plot([a[1] for a in history])

plt.legend(['training accuracy', 'validation accuracy']);

Finally, evaluate on the test set.

In [12]:
predict_and_evaluate(mlp_model, Xtest_v, Ytest_v)
Out[12]:
0.8546772311283091
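
The cell above also declared linear_model, but we only trained mlp_model. If you want a baseline to compare against, the linear model can be trained and evaluated in exactly the same way (a sketch; linear_history is just a new name so that the MLP's history is not overwritten):

# Train the linear baseline and evaluate it on the test set.
linear_history = train_classifier(linear_model, Xv, Yv)
predict_and_evaluate(linear_model, Xtest_v, Ytest_v)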