Search

Logistic Regression to Predict Weather Data

Note

Composed of …
Import: nn, TensorDataset, DataLoader, F
Define and load data: TensorDataset, DataLoader
Define model: nn.Module, nn.Linear
Define loss function: F.cross_entropy
Define optimizer: torch.optim.SGD
Build training function: here
1. Generate predictions
2. Calculate loss
3. Compute gradients
4. Update parameters using gradients
5. Reset the gradients to zero
Train model
Training set - used to train the model, i.e. compute the loss and adjust the weights of the model using gradient descent
Validation set - used to evaluate the model while training, adjust hyperparameters (learning rate, etc.) and pick the best version of the model
Test set - used to compare different models, or different types of modeling approaches, and report the final accuracy of the model

Import

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# TensorDataset is needed where the dataset is built further down this file;
# it was previously missing from the imports.
from torch.utils.data import DataLoader, TensorDataset, random_split

# Import CSV file
# NOTE(review): csv_path is not defined in the visible code - set it to the
# location of the weather CSV before running this line.
raw_df = pd.read_csv(f'{csv_path}')
Python
복사

Data Validation [Optional]

'''Data Validation'''

# Peek at the first few records
raw_df.head()

# Per-column summary of the frame
raw_df.info()

# Count of missing values in each column
raw_df.isna().sum()

# Share of each label value, as a fraction of all rows
raw_df[f'{column1}'].value_counts().div(len(raw_df))

# Names of the numeric columns
numerical = list(raw_df.select_dtypes(include=np.number).columns)

# Histogram of every numeric column
raw_df[numerical].hist()

# Summary statistics of the numeric columns
raw_df[numerical].describe()
Python
복사

Preprocessing

'''Configure'''
# Set columns
input_cols = ['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindSpeed9am', 'WindSpeed3pm',
              'Humidity9am', 'Humidity3pm', 'Temp9am', 'Temp3pm', 'RainToday']
target_cols = ['RainTomorrow']

# Get numerical & categorical columns [No need to change]
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of the frame that was read from the CSV.
raw_df = raw_df[[*input_cols, *target_cols]].copy()
numerical_cols = raw_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = raw_df.select_dtypes('object').columns.tolist()

# Set column properties
fillna_mean_cols = ['MinTemp', 'MaxTemp', 'Humidity3pm', 'Temp9am', 'Temp3pm']
fillna_median_cols = ['Rainfall', ]
fillna_mode_cols = []
bool_cols = ['RainToday', 'RainTomorrow']
dropna_cols = ['RainToday', 'RainTomorrow']
encode_cols = ['Location']
# NOTE(review): WindSpeed9am, WindSpeed3pm and Humidity9am are input columns
# but appear in none of the fillna/dropna lists above, so any missing values
# in them are left untouched by this cell - confirm that is intended.
'''--Configure--'''


def fillNa(df, method, columns):
    """Fill missing values in the given columns of ``df`` and return ``df``.

    Args:
        df: DataFrame to modify (the columns are reassigned on this frame).
        method: 'mean', 'median' or 'mode' - the statistic of each column
            used as its fill value.
        columns: names of the columns to fill.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method not in ('mean', 'median', 'mode'):
        raise ValueError(f"Unsupported fill method: {method!r}")

    for column in columns:
        if method == 'mean':
            fill_value = df[column].mean()
        elif method == 'median':
            fill_value = df[column].median()
        else:
            # mode() returns a Series (ties are possible); take the first entry
            fill_value = df[column].mode()[0]
        df[column] = df[column].fillna(value=fill_value)
    return df


def encodeBool(df, columns):
    """Encode 'No'/'Yes' columns of ``df`` as 0/1 and return ``df``.

    Values other than 'No' and 'Yes' become NaN, so rows with missing
    values in these columns should be dropped beforehand.
    """
    for column in columns:
        df[column] = df[column].map({'No': 0, 'Yes': 1})
    return df


# Drop rows that have null values in the required columns
raw_df = raw_df.dropna(subset=dropna_cols)

# Fill remaining null values
raw_df = fillNa(raw_df, 'mean', fillna_mean_cols)
raw_df = fillNa(raw_df, 'mode', fillna_mode_cols)
raw_df = fillNa(raw_df, 'median', fillna_median_cols)

# Encode Boolean Columns (Yes/No -> 1/0)
raw_df = encodeBool(raw_df, bool_cols)

# Scale Numerical Columns (0 to 1)
scaler = MinMaxScaler()
scaler.fit(raw_df[numerical_cols])
raw_df[numerical_cols] = scaler.transform(raw_df[numerical_cols])

# Encode Categorical Columns
# sparse_output=False returns a dense array; handle_unknown='ignore' encodes
# categories not seen during fit as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(raw_df[encode_cols])

# Identify the full list of categories across all encode columns
encoded_cols = list(encoder.get_feature_names_out(encode_cols))
raw_df[encoded_cols] = encoder.transform(raw_df[encode_cols])

# Set input columns: swap each encoded column for its one-hot columns
input_cols = [col for col in input_cols if col not in encode_cols]
input_cols = [*input_cols, *encoded_cols]
Python
복사

Define and Load Data

'''Configure'''
# Fraction of the records assigned to each split
# (test_pct is informational: the test split below is whatever remains)
train_pct = 0.6
val_pct = 0.2
test_pct = 0.2

# Number of samples per DataLoader batch
batch_size = 100
'''--Configure--'''

# Feature and label tensors built from the preprocessed frame
input_df = torch.from_numpy(raw_df[list(input_cols)].to_numpy())
target_df = torch.from_numpy(raw_df[list(target_cols)].to_numpy())

# Pair features with labels (both cast with .float()) and size the splits
dataset = TensorDataset(input_df.float(), target_df.float())
num_records = len(dataset)
num_train, num_val = int(num_records * train_pct), int(num_records * val_pct)
num_test = num_records - num_train - num_val
train_ds, val_ds, test_ds = random_split(dataset, [num_train, num_val, num_test])

# Wrap each split in a DataLoader; only the training loader shuffles
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)
Python
복사

Define Model

'''Configure'''
# Input columns were already finalised during preprocessing; the label
# columns are the configured target columns.
output_cols = list(target_cols)

# Layer sizes
input_size = len(input_cols)  # one input feature per preprocessed column
hidden_size = 32              # tunable; 32 is only a starting point
output_size = 2               # two classes: 0 (No) and 1 (Yes)

# For training
learning_rate = 0.0001
'''--Configure--'''


# Neural Network Base
class ClassificationBase(nn.Module):
    """Shared loss/accuracy bookkeeping for classification models.

    Subclasses implement ``forward`` and return one score per class for
    every sample.
    """

    def __accuracy(self, outputs, labels):
        """Return the fraction of samples whose top-scoring class equals the label."""
        _, preds = torch.max(outputs, dim=1)
        # Flatten (N, 1) labels so the comparison is element-wise rather
        # than broadcast to (N, N).
        labels = labels.reshape(-1).long()
        return torch.tensor(torch.sum(preds == labels).item() / len(preds))

    def compute_loss_acc(self, batch):
        """Return the detached loss and the accuracy for one batch.

        Args:
            batch: ``(inputs, labels)`` pair; labels hold class indices and
                may be float and/or shaped (N, 1).
        """
        inputs, labels = batch
        # cross_entropy expects class indices as a 1-D integer tensor
        labels = labels.reshape(-1).long()
        outputs = self(inputs)                   # Generate prediction
        loss = F.cross_entropy(outputs, labels)  # Calculate Loss
        acc = self.__accuracy(outputs, labels)
        return {'loss': loss.detach(), 'acc': acc}

    def combine_loss_acc(self, results):
        """Average the per-batch results from ``compute_loss_acc`` into one dict."""
        batch_losses = [x['loss'] for x in results]
        epoch_loss = torch.stack(batch_losses).mean()  # Combine losses
        batch_accs = [x['acc'] for x in results]
        epoch_acc = torch.stack(batch_accs).mean()     # Combine accuracies
        return {'loss': epoch_loss.item(), 'acc': epoch_acc.item()}

    def print_result(self, epoch, result):
        """Print the loss and accuracy recorded for ``epoch``."""
        print(f"Epoch [{epoch}], loss: [{result['loss']}], acc: [{result['acc']}]")


# Neural Network Model
class DNN_Model(ClassificationBase):
    """Feed-forward network with a single hidden layer and ReLU activation."""

    def __init__(self, input_size=input_size, hidden_size=hidden_size,
                 output_size=output_size):
        """Create the layers; sizes default to the values configured above."""
        super().__init__()
        # hidden layer 1
        self.linear1 = nn.Linear(input_size, hidden_size)
        # output layer
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, inputBatch):
        """Return one raw score per class for each row of ``inputBatch``."""
        # Can Flatten input
        # Get intermediate outputs using hidden layer
        output = self.linear1(inputBatch)
        # Apply activation function
        output = F.relu(output)
        # Get predictions using output layer
        output = self.linear2(output)
        return output


model = DNN_Model()

# Cross entropy
#   preds: [0.331, 0.668]
#   real:  [0, 1]
#   loss = -math.log(0.668)
# A higher predicted probability for the correct label gives a lower loss.
#
# Cross entropy is the negative logarithm of the predicted probability of the
# correct label, averaged over all training samples. To interpret a loss value
# such as 2.23, look at e^-2.23, which is around 0.1: on average the model
# assigns a probability of about 0.1 to the correct label.
# The lower the loss, the better the model.
Python
복사

Define Training Function

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` and return the validation result of every epoch.

    Each training batch goes through the five steps: generate predictions,
    calculate the loss, compute gradients, update the parameters and reset
    the gradients. After each epoch the model is scored on ``val_loader``
    through its ``compute_loss_acc`` / ``combine_loss_acc`` methods and the
    result is printed with ``print_result``.

    Args:
        epochs: number of passes over ``train_loader``.
        lr: learning rate handed to the optimizer.
        model: module providing ``compute_loss_acc``, ``combine_loss_acc``
            and ``print_result``.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        opt_func: optimizer factory called as ``opt_func(parameters, lr)``.

    Returns:
        List with one combined validation result per epoch.
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)

    for epoch in range(epochs):
        # Training phase
        model.train()
        for inputs, labels in train_loader:
            outputs = model(inputs)
            # The loss is computed here because compute_loss_acc returns a
            # detached loss, which cannot be back-propagated. Labels are
            # flattened to the 1-D integer class indices cross_entropy expects.
            loss = F.cross_entropy(outputs, labels.reshape(-1).long())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Validation phase (no gradients needed)
        model.eval()
        with torch.no_grad():
            batch_results = [model.compute_loss_acc(batch) for batch in val_loader]
        result = model.combine_loss_acc(batch_results)
        model.print_result(epoch, result)
        history.append(result)

    return history
Python
복사

Scoring

# Fraction of training predictions that match the true labels.
# NOTE(review): train_targets and train_preds are not defined in the visible
# code - they have to be built before this call.
accuracy_score(y_true=train_targets, y_pred=train_preds)
Python
복사
# NOTE(review): this cell repeats the "Define and Load Data" cell earlier in
# the file; running it again re-draws the random train/val/test split.
'''Configure'''
# Fraction of the records assigned to each split
# (test_pct is informational: the test split below is whatever remains)
train_pct = 0.6
val_pct = 0.2
test_pct = 0.2

# Number of samples per DataLoader batch
batch_size = 100
'''--Configure--'''

# Feature and label tensors built from the preprocessed frame
input_df = torch.from_numpy(raw_df[list(input_cols)].to_numpy())
target_df = torch.from_numpy(raw_df[list(target_cols)].to_numpy())

# Pair features with labels (both cast with .float()) and size the splits
dataset = TensorDataset(input_df.float(), target_df.float())
num_records = len(dataset)
num_train, num_val = int(num_records * train_pct), int(num_records * val_pct)
num_test = num_records - num_train - num_val
train_ds, val_ds, test_ds = random_split(dataset, [num_train, num_val, num_test])

# Wrap each split in a DataLoader; only the training loader shuffles
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)
Python
복사