Note
•
Composed of …
Import: nn, TensorDataset, DataLoader, F
Define and load data: TensorDataset, DataLoader
Define model: nn.
Define loss function: F.
Define optimizer: torch.optim.
Build training function: here
1.
Generate predictions
2.
Calculate loss
3.
Compute gradients
4.
Update parameters using gradients
5.
Reset the gradients to zero
Train model
•
Training set - used to train the model, i.e. compute the loss and adjust the weights of the model using gradient descent
•
Validation set - used to evaluate the model while training, adjust hyperparameters (learning rate etc) and pick the best version of the model
•
Test set - used to compare different models, or different types of modeling approaches, and report the final accuracy of the model
Import
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from torch.utils.data.dataloader import DataLoader
# Load the CSV file into a DataFrame
# NOTE(review): csv_path is a placeholder that is not defined in the visible source
raw_df = pd.read_csv(filepath_or_buffer=f'{csv_path}')
Python
복사
Data Validation [Optional]
'''Data Validation'''
# Exploratory checks on raw_df. Every statement below is a bare expression, so
# outside an interactive session its return value is discarded.
# Display the first few records
raw_df.head()
# Display data summary
raw_df.info()
# Display the number of null values in each column
raw_df.isnull().sum()
# Show the share of rows taken by each label value
# NOTE(review): column1 is a placeholder that is not defined in the visible source
raw_df[f'{column1}'].value_counts()/len(raw_df)
# Select numerical columns
numerical = raw_df.select_dtypes(include=np.number).columns.tolist()
# Display histograms of the numerical columns
raw_df[numerical].hist()
# Display statistical properties of the numerical columns
raw_df[numerical].describe()
Python
복사
Preprocessing
'''Configure'''
# Set columns
input_cols = [
    'Location',
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Temp9am',
    'Temp3pm',
    'RainToday',
]
target_cols = ['RainTomorrow']
# Get numerical & categorical columns [No need to change]
# .copy() makes raw_df an independent frame, so the column assignments done
# during preprocessing do not act on a selection of the loaded CSV frame.
raw_df = raw_df[[*input_cols, *target_cols]].copy()
numerical_cols = raw_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = raw_df.select_dtypes('object').columns.tolist()
# Set column properties: which columns get which null handling / encoding
# NOTE(review): WindSpeed9am, WindSpeed3pm and Humidity9am are in input_cols but
# in none of the fillna_* / dropna lists, so any nulls in them are left as-is —
# confirm that is intended.
fillna_mean_cols = ['MinTemp', 'MaxTemp', 'Humidity3pm', 'Temp9am', 'Temp3pm']
fillna_median_cols = ['Rainfall']
fillna_mode_cols = []
bool_cols = ['RainToday', 'RainTomorrow']
dropna_cols = ['RainToday', 'RainTomorrow']
encode_cols = ['Location']
'''--Configure--'''
def fillNa(df, method, columns):
    """Fill the null values of the given columns with a per-column statistic.

    Args:
        df: DataFrame to update; it is modified in place.
        method: 'mean', 'median' or 'mode' - the statistic of each column
            that replaces its null values.
        columns: names of the columns to fill.

    Returns:
        The same ``df`` object, so the call can be chained or reassigned.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method not in ('mean', 'median', 'mode'):
        raise ValueError(f"Unsupported fill method: {method!r}")
    for column in columns:
        series = df[column]
        if method == 'mean':
            fill_value = series.mean()
        elif method == 'median':
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # A column made only of nulls has no mode; leave it untouched.
                continue
            fill_value = modes.iloc[0]
        # Write into the frame that was passed in, not a module-level one.
        df[column] = series.fillna(fill_value)
    return df
def encodeBool(df, columns):
    """Encode 'No' / 'Yes' values of the given columns as 0 / 1.

    Args:
        df: DataFrame to update; it is modified in place.
        columns: names of the columns holding 'No' / 'Yes' values.

    Returns:
        The same ``df`` object. Values other than 'No' / 'Yes' are kept as-is.
    """
    yes_no_codes = {'No': 0, 'Yes': 1}
    for column in columns:
        # Assign the result back to the passed-in frame rather than replacing
        # in place on a column selection of a module-level frame.
        # infer_objects() turns the replaced object column into a numeric one
        # when every remaining value allows it.
        df[column] = df[column].replace(yes_no_codes).infer_objects()
    return df
# Drop the rows that have a null value in any of the dropna columns
raw_df = raw_df.dropna(subset=dropna_cols)
# Fill the remaining null values with the configured per-column statistic
raw_df = fillNa(raw_df, 'mean', fillna_mean_cols)
raw_df = fillNa(raw_df, 'mode', fillna_mode_cols)
raw_df = fillNa(raw_df, 'median', fillna_median_cols)
# Encode Boolean Columns (the Yes/No columns, not the one-hot encoded ones)
raw_df = encodeBool(raw_df, bool_cols)
# Scale Numerical Columns (0 to 1)
scaler = MinMaxScaler()
scaler.fit(raw_df[numerical_cols])
raw_df[numerical_cols] = scaler.transform(raw_df[numerical_cols])
# Encode Categorical Columns
# sparse_output=False makes transform return a dense array;
# handle_unknown='ignore' encodes a category not seen during fit as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Identify the full list of categories across all encode columns
encoder.fit(raw_df[encode_cols])
encoded_cols = list(encoder.get_feature_names_out(encode_cols))
raw_df[encoded_cols] = encoder.transform(raw_df[encode_cols])
# Set input columns: swap each encoded source column for its one-hot columns
input_cols = [col for col in input_cols if col not in encode_cols]
input_cols = [*input_cols, *encoded_cols]
Python
복사
Define and Load Data
'''Configure'''
# Fraction of the records assigned to each split
# (test_pct is not read below: the test split takes the remainder)
train_pct = 0.6
val_pct = 0.2
test_pct = 0.2
# Number of samples per DataLoader batch
batch_size = 100
'''--Configure--'''
# Convert the input & target columns to tensors
input_df = torch.from_numpy(raw_df[list(input_cols)].to_numpy())
target_df = torch.from_numpy(raw_df[list(target_cols)].to_numpy())
# Pair inputs with targets (both cast with .float()) and split the dataset
dataset = TensorDataset(input_df.float(), target_df.float())
num_records = len(dataset)
num_train = int(train_pct * num_records)
num_val = int(val_pct * num_records)
# Whatever is left goes to the test split, so the three sizes always add up
num_test = num_records - num_train - num_val
train_ds, val_ds, test_ds = random_split(
    dataset,
    [num_train, num_val, num_test],
)
# Batch each split; only the training loader shuffles
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)
Python
복사
Define Model
'''Configure'''
# Set input and label columns
# input_cols is left as produced by the preprocessing step (remaining inputs
# plus the one-hot encoded columns); the label columns are target_cols.
output_cols = [*target_cols]
# Layer sizes read by DNN_Model
input_size = len(input_cols)  # one feature per prepared input column
hidden_size = 32  # hidden-layer width; placeholder default, tune as needed
output_size = 2  # one logit per class (No / Yes)
# For training
learning_rate = 0.0001
'''--Configure--'''
# Neural Network Base
class ClassificationBase(nn.Module):
    """Evaluation helpers for classifiers whose forward pass returns class logits.

    Subclasses provide ``forward``; labels are class indices and may arrive
    either as a 1-D tensor or as a single-column 2-D tensor of any numeric dtype.
    """

    def __accuracy(self, outputs, labels):
        """Return the fraction of rows whose highest-scoring class equals the label."""
        preds = torch.argmax(outputs, dim=1)
        # Flatten the labels so the comparison is element-wise instead of
        # broadcasting a (N,) prediction against a (N, 1) label column.
        return (preds == labels.reshape(-1)).float().mean()

    def compute_loss_acc(self, batch):
        """Compute the loss and accuracy of one batch.

        Args:
            batch: an ``(inputs, labels)`` pair.

        Returns:
            Dict with 'loss' (detached tensor) and 'acc' (tensor).
        """
        inputs, labels = batch
        # cross_entropy expects class indices as a 1-D integer tensor.
        labels = labels.reshape(-1).long()
        outputs = self(inputs)                   # Generate prediction
        loss = F.cross_entropy(outputs, labels)  # Calculate loss
        acc = self.__accuracy(outputs, labels)
        return {'loss': loss.detach(), 'acc': acc}

    def combine_loss_acc(self, results):
        """Average the per-batch results of ``compute_loss_acc`` into one result.

        Args:
            results: list of dicts as returned by ``compute_loss_acc``.

        Returns:
            Dict with 'loss' and 'acc' as Python floats.
        """
        batch_losses = [x['loss'] for x in results]
        epoch_loss = torch.stack(batch_losses).mean()  # Combine losses
        batch_accs = [x['acc'] for x in results]
        epoch_acc = torch.stack(batch_accs).mean()     # Combine accuracies
        return {'loss': epoch_loss.item(), 'acc': epoch_acc.item()}

    def print_result(self, epoch, result):
        """Print the loss and accuracy stored in ``result`` for ``epoch``."""
        print(f"Epoch [{epoch}], loss: [{result['loss']}], acc: [{result['acc']}]")
# Neural Network Model
class DNN_Model(ClassificationBase):
    """Fully connected network: linear layer, ReLU, linear layer.

    Layer sizes are read from the module-level names input_size, hidden_size
    and output_size.
    """

    def __init__(self):
        super().__init__()
        # hidden layer 1: input_size -> hidden_size
        self.linear1 = nn.Linear(input_size, hidden_size)
        # output layer: hidden_size -> output_size
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, inputBatch):
        """Return the output-layer values for a batch of inputs."""
        # Can Flatten input
        # Hidden layer followed by the activation function
        hidden = F.relu(self.linear1(inputBatch))
        # Predictions from the output layer
        return self.linear2(hidden)


model = DNN_Model()
"""
cross entropy
preds: [0.331, 0.668]
real: [0, 1]
-math.log(0.668)
# A higher predicted probability for the correct label gives a lower loss
Cross entropy is the negative logarithm of the predicted probability of the correct label, averaged over all training samples.
So a loss value of 2.23 corresponds to e^-2.23, which is around 0.1, as the predicted probability of the correct label on average.
The lower the loss, the better the model.
"""
Python
복사
Define Training Function
@torch.no_grad()
def evaluate(model, val_loader):
    """Return the combined loss/accuracy of ``model`` over ``val_loader``.

    Runs with gradient tracking disabled and the model in eval mode.
    """
    model.eval()
    results = [model.compute_loss_acc(batch) for batch in val_loader]
    return model.combine_loss_acc(results)


def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` and validate it after every epoch.

    Args:
        epochs: number of passes over ``train_loader``.
        lr: learning rate handed to the optimizer.
        model: module exposing ``compute_loss_acc``, ``combine_loss_acc`` and
            ``print_result``, whose forward pass returns class logits.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        opt_func: optimizer factory called as ``opt_func(parameters, lr)``.

    Returns:
        List with one validation result (dict with 'loss' and 'acc') per epoch.
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        # Training phase
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()                    # Reset the gradients to zero
            outputs = model(inputs)                  # Generate predictions
            # Same loss as compute_loss_acc, but kept attached to the graph;
            # cross_entropy expects class indices as a 1-D integer tensor.
            loss = F.cross_entropy(outputs, labels.reshape(-1).long())
            loss.backward()                          # Compute gradients
            optimizer.step()                         # Update parameters
        # Validation phase
        result = evaluate(model, val_loader)
        model.print_result(epoch, result)
        history.append(result)
    return history
Python
복사
Scoring
# Accuracy of the training predictions against the training targets
# NOTE(review): train_targets and train_preds are not defined in the visible source
accuracy_score(y_true=train_targets, y_pred=train_preds)
Python
복사
'''Configure'''
# NOTE(review): this section repeats the "Define and Load Data" snippet earlier in the file
# Fraction of the records assigned to each split
# (test_pct is not read below: the test split takes the remainder)
train_pct = 0.6
val_pct = 0.2
test_pct = 0.2
# Number of samples per DataLoader batch
batch_size = 100
'''--Configure--'''
# Convert the input & target columns to tensors
input_df = torch.from_numpy(raw_df[list(input_cols)].to_numpy())
target_df = torch.from_numpy(raw_df[list(target_cols)].to_numpy())
# Pair inputs with targets (both cast with .float()) and split the dataset
dataset = TensorDataset(input_df.float(), target_df.float())
num_records = len(dataset)
num_train = int(train_pct * num_records)
num_val = int(val_pct * num_records)
# Whatever is left goes to the test split, so the three sizes always add up
num_test = num_records - num_train - num_val
train_ds, val_ds, test_ds = random_split(
    dataset,
    [num_train, num_val, num_test],
)
# Batch each split; only the training loader shuffles
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)
Python
복사

