Deep Neural Networks (CNNs,...)
This notebook trains a Convolutional Neural Network (CNN) to classify MNIST data. We will implement a CNN in PyTorch for handwritten-digit recognition on the MNIST dataset.
1.Classification using CNN
2.Batch_Normalization
import torch
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torchvision
from tqdm.auto import tqdm, trange
from torch.utils.data import random_split
%matplotlib inline
import matplotlib.pyplot as plt
import torch.nn.functional as F
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('runing on ',device)
# Mini-batch size shared by every DataLoader below.
batch_size = 32
transform = transforms.ToTensor()

# Official MNIST splits; the training split is downloaded into ./data if missing.
train_val_data = datasets.MNIST('./data',
                                train=True,
                                download=True,
                                transform=transform)
test_data = datasets.MNIST('./data',
                           train=False,
                           transform=transform)
test_data.classes

# Hold out 10% of the official training split for validation.
train_size = int(0.9 * len(train_val_data))
val_size = len(train_val_data) - train_size
train_data, val_data = random_split(train_val_data, [train_size, val_size])

# Training batches are shuffled and kept at a constant size.
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation loaders keep every sample (no drop_last) in a fixed order (no
# shuffle) so that the reported metrics cover the whole split.
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, shuffle=False, drop_last=False)
val_loader = torch.utils.data.DataLoader(
    val_data, batch_size=batch_size, shuffle=False, drop_last=False)

print('number of batches in train data , test data and validation data are:')
print(len(train_loader), len(test_loader), len(val_loader))  # number of batches in train, test and val data

images, labels = next(iter(train_loader))
print(images.shape, labels.shape)
print(test_data)

# Preview: take one test batch, keep 10 images and drop the channel axis.
x = next(iter(test_loader))[0][:10].squeeze(1)
fig = plt.figure(figsize=(20, 20))  # figure size in inches
for i, image in enumerate(x):
    ax = fig.add_subplot(1, 20, i + 1, xticks=[], yticks=[])
    ax.imshow(image.numpy(), cmap='gray', interpolation='nearest')
class CNN(nn.Module):
    """Two-convolution classifier with optional batch normalization and dropout.

    Args:
        BN: when truthy, apply BatchNorm2d after each convolution.
        Dropout: when truthy, apply dropout after the second convolution
            (p=0.25) and after the first fully connected layer (p=0.5).

    ``forward`` returns raw class scores (logits) of shape (batch, 10).
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it softmax
    outputs would squash the gradients; use ``torch.softmax(out, dim=1)`` when
    probabilities are needed.
    """

    def __init__(self, BN, Dropout):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.bn2 = nn.BatchNorm2d(64)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # Two unpadded 3x3 convolutions shrink a 28x28 input to 24x24.
        self.fc1 = nn.Linear(64*24*24, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.Dropout = Dropout
        self.BN = BN

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, 1, 28, 28)."""
        x = self.conv1(x)
        if self.BN:
            x = self.bn1(x)
        x = self.relu(x)

        x = self.conv2(x)
        if self.BN:
            x = self.bn2(x)
        x = self.relu(x)
        if self.Dropout:
            x = self.dropout1(x)

        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        if self.Dropout:
            x = self.dropout2(x)
        # No softmax here: the loss function expects unnormalized scores.
        return self.fc2(x)
#Training on val and train data
def _run_epoch(net, loader, cost_func, optimizer=None):
    """Run one pass over ``loader`` and return (mean batch loss, accuracy in %).

    With an ``optimizer`` the network is put in train mode and updated after
    every batch; without one it is put in eval mode and run with autograd off.
    """
    training = optimizer is not None
    net.train(training)
    loss_sum, correct, total = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for data, output in loader:
            data, output = data.to(device), output.to(device)
            prediction = net(data)
            loss = cost_func(prediction, output)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            pred = torch.argmax(prediction, dim=1)
            total += output.size(0)
            correct += (pred == output).sum().item()
    # Guard against an empty loader instead of dividing by zero.
    mean_loss = loss_sum / max(len(loader), 1)
    accuracy = (correct / total) * 100 if total else 0.0
    return mean_loss, accuracy


def train(net, epochs=41, lr=1e-4, momentum=0.9):
    """Train ``net`` with SGD and evaluate it on the validation set each epoch.

    Reads the module-level ``train_loader``, ``val_loader`` and ``device``.

    Args:
        net: the model to optimise in place.
        epochs: number of passes over the training data.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        ``(val_loss, train_loss, train_acc, val_acc)``: four lists with one
        entry per epoch. Losses are the mean cross-entropy per batch
        (CrossEntropyLoss already averages inside a batch, so no further
        division by the batch size is needed); accuracies are percentages.
    """
    from tqdm.notebook import tqdm
    costFunc = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    val_loss, train_loss, train_acc, val_acc = [], [], [], []
    for epoch in tqdm(range(epochs)):
        epoch_train_loss, epoch_train_acc = _run_epoch(net, train_loader, costFunc, optimizer)
        # Validation: eval mode + no gradients, so dropout is disabled and
        # batch norm uses its running statistics.
        epoch_val_loss, epoch_val_acc = _run_epoch(net, val_loader, costFunc)

        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc)
        val_loss.append(epoch_val_loss)
        val_acc.append(epoch_val_acc)

        if epoch % 5 == 0:
            print('***************** epoch',epoch,'*****************')
            print('train loss = ',epoch_train_loss)
            print('train accuracy ','= ',str(epoch_train_acc),'%')
            print('validation loss = ',epoch_val_loss)
            print('vaidation accuracy ','= ',str(epoch_val_acc),'%')
    return val_loss,train_loss,train_acc,val_acc
In the next parts we will train 3 models to analyse the effect of batch normalization and dropout.
model1: model defined without batch normalization and without dropout (Batch_Normalization=False,Dropout=False)
model2: model defined with batch normalization and without dropout (Batch_Normalization=True,Dropout=False)
# model3: model defined with dropout and without batch normalization (Batch_Normalization=False,Dropout=True)

# Baseline: no batch normalization, no dropout.
model1 = CNN(BN=False, Dropout=False).to(device)
val1_loss, train1_loss, train1_acc, val1_acc = train(model1)

# Accuracy per epoch (train vs. validation).
plt.figure()
plt.subplot(2, 1, 1)
plt.title('accuracy')
plt.plot(val1_acc, label='validation')
plt.plot(train1_acc, label='train')
plt.legend()
plt.show()

# Loss per epoch. Both curves share one axes, so a single title is set
# (a second plt.title() call would simply overwrite the first).
plt.figure()
plt.subplot(2, 1, 1)
plt.title('loss')
plt.plot(val1_loss, label='validation')
plt.plot(train1_loss, label='train')
plt.legend()
plt.show()
# Accuracy of the trained baseline model on the test set.
correct = 0
total = 0
model1.eval()  # disable dropout / use running batch-norm statistics
with torch.no_grad():  # inference only: no autograd graph needed
    for data, output in test_loader:
        total += output.size(0)
        data, output = data.to(device), output.to(device)
        prediction = model1(data)
        pred = torch.argmax(prediction, dim=1)
        correct += (pred == output).sum().item()
# The previous message hard-coded "epoch 30", which does not match the
# number of epochs train() runs, so the epoch is no longer printed.
print('test Accuracy ', '= ', str((correct/total)*100), '%')
How does batch normalization change the training process of a CNN?\ Batch normalization helps the network converge faster. As we can see in the plots below, the loss of the network with batch normalization decreases much faster than the loss of the network without it, because batch normalization reduces internal covariate shift (the shifting of the distribution of hidden values from one batch of input to the next). This leads to faster convergence and reduces the training time, so it improves the speed, performance, and stability of our neural networks.
# Same architecture with batch normalization switched on.
model2 = CNN(BN=True, Dropout=False).to(device)
val2_loss, train2_loss, train2_acc, val2_acc = train(model2)

# Validation curves of the baseline (model1) against the batch-norm model,
# one figure per metric.
bn_comparison = (
    ('loss', ((val1_loss, 'r', 'without BN'), (val2_loss, 'g', 'with BN'))),
    ('accuarcy', ((val1_acc, 'm', 'without BN'), (val2_acc, 'b', 'with BN'))),
)
for figure_title, curves in bn_comparison:
    plt.title(figure_title)
    for series, line_format, legend_label in curves:
        plt.plot(series, line_format, label=legend_label)
    plt.legend()
    plt.show()
Dropout: Dropout is a technique used in neural networks to prevent overfitting the training data by dropping out neurons with probability p>0. It forces the model to avoid relying too much on particular sets of features.
It helps to reduce overfitting and generalization error. The dropout technique deactivates a few neurons in the neural network at random and thus avoids overfitting. Dropout deactivates some neurons at random at each training step instead of training on the original network. In the next iteration of the training step, the set of hidden neurons deactivated by dropout changes because of its probabilistic behavior. In this way, by applying dropout, i.e. deactivating certain individual nodes at random during training, we can simulate an ensemble of neural networks with different architectures.
# Same architecture with dropout switched on (no batch normalization).
model3 = CNN(BN=False, Dropout=True).to(device)
val3_loss, train3_loss, train3_acc, val3_acc = train(model3)

# Validation curves of the baseline (model1) against the dropout model,
# one figure per metric.
dropout_comparison = (
    ('loss', ((val1_loss, 'r', 'without dropout'), (val3_loss, 'g', 'with dropout'))),
    ('accuracy', ((val1_acc, 'm', 'without dropout'), (val3_acc, 'b', 'with dropout'))),
)
for figure_title, curves in dropout_comparison:
    plt.title(figure_title)
    for series, line_format, legend_label in curves:
        plt.plot(series, line_format, label=legend_label)
    plt.legend()
    plt.show()
When we talk about filters in convolutional neural networks, then we are specifically talking about the weights. If you do a lot of practical deep learning coding, then you may know them by the name of kernels.\ These filters will determine which pixels or parts of the image the model will focus on
model1.parameters
# Collect every convolutional layer of model1 and its weight tensor.
# nn.Module.modules() walks the whole module tree in registration order, so
# Conv2d layers nested at any depth (e.g. inside nn.Sequential) are found
# without a hand-written traversal; isinstance also accepts Conv2d subclasses.
model_children = list(model1.children())
conv_layers = [module for module in model1.modules() if isinstance(module, nn.Conv2d)]
model_weights = [layer.weight for layer in conv_layers]
counter = len(conv_layers)
print(f"Total convolutional layers: {counter}")
# the first conv layer filters visualization
plt.figure(figsize=(20, 17))
# `kernel` instead of `filter`: the loop variable stays alive after the loop
# in a notebook and would otherwise shadow the built-in filter().
for i, kernel in enumerate(model_weights[0]):
    plt.subplot(8, 8, i + 1)
    # Show the kernel's first input channel; detach from autograd and move
    # to the CPU so matplotlib can read it.
    plt.imshow(kernel[0, :, :].detach().cpu(), cmap='gray')
    plt.axis('off')
plt.show()
Feature maps are what we get after a filter has passed through the pixel values of an input image. Specifically, it is what the convolutional layer sees after passing the filters on the image. It is what we call a convolution operation in terms of deep learning
# Push one training batch through the convolutional layers only, keeping the
# output of every layer.
images, labels = next(iter(train_loader))
with torch.no_grad():  # visualisation only, no gradients required
    # .to(device) instead of .cuda(): works on CPU-only machines too and
    # matches wherever the model was placed.
    results = [conv_layers[0](images.to(device))]
    for layer in conv_layers[1:]:
        results.append(layer(results[-1]))
outputs = results
# visualizing features
for num_layer, layer_output in enumerate(outputs):
    plt.figure(figsize=(30, 30))
    # Feature maps of the first image in the batch, detached from autograd.
    layer_viz = layer_output[0, :, :, :].detach()
    print(layer_viz.size())
    # The 8x8 grid holds at most 64 maps. `feature_map` (not `filter`) avoids
    # shadowing the built-in filter().
    for i, feature_map in enumerate(layer_viz[:64]):
        plt.subplot(8, 8, i + 1)
        plt.imshow(feature_map.cpu(), cmap='gray')
        plt.axis("off")
    print(f" layer {num_layer} feature maps...")
    plt.show()
    plt.close()
In this Jupyter notebook, several linear regression methods are implemented, and the performance and accuracy of the different algorithms are reported. The dataset contains information about Toyota Corolla cars with different options and features.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from collections import Counter
# Load the dataset and take a first look at it.
data_df = pd.read_csv("./ToyotaCorolla.csv")
data_df.head()
data_df.count()
data_df.describe()
data_df.isnull().sum()
# Check correlation among the numeric parameters.
# Non-numeric columns are excluded explicitly: recent pandas versions raise
# instead of silently dropping them inside corr().
corr = data_df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(8,8))
# Generate a heatmap
sns.heatmap(corr, cmap = 'magma', annot = True, fmt = ".2f", ax = ax)
# Heatmap cells are centred at i + 0.5, so the labels go there rather than
# on the cell edges.
tick_positions = np.arange(len(corr.columns)) + 0.5
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns, rotation=0)
plt.show()
# Regression plots of Age, KM, CC and Weight against Price on a 2x2 grid.
f, axes = plt.subplots(2,2, figsize=(12,8))
# (column plotted on y, target axes, whether the y axis sits on the right)
panels = (
    ('Age', axes[0,0], False),
    ('KM', axes[0,1], True),
    ('CC', axes[1,0], False),
    ('Weight', axes[1,1], True),
)
for feature, panel_ax, y_on_right in panels:
    sns.regplot(x = 'Price', y = feature, data = data_df, ax = panel_ax, scatter_kws={'alpha':0.6})
    panel_ax.set_xlabel('Price', fontsize = 14)
    panel_ax.set_ylabel(feature, fontsize=14)
    if y_on_right:
        # Right-hand column: move label and ticks to the outer edge.
        panel_ax.yaxis.set_label_position("right")
        panel_ax.yaxis.tick_right()
    else:
        panel_ax.yaxis.tick_left()
plt.show()
# Convert the categorical columns into dummy/indicator columns so that every
# feature is numeric, then preview the encoded frame.
data_df = pd.get_dummies(data_df)
data_df.head()
In statistics, simple linear regression is a linear regression model with a single explanatory variable. That is, it concerns two-dimensional sample points with one independent variable and one dependent variable (conventionally, the x and y coordinates in a Cartesian coordinate system) and finds a linear function (a non-vertical straight line) that, as accurately as possible, predicts the dependent variable values as a function of the independent variable. The adjective simple refers to the fact that the outcome variable is related to a single predictor.
from sklearn.linear_model import LinearRegression
Let's see how our model performs if we consider only one independent variable (Age) to predict the price.
from sklearn.model_selection import train_test_split

# Age is the only predictor (double brackets keep it 2-D); Price is the target.
X_simple_lreg = data_df[["Age"]].values
y_simple_lreg = data_df["Price"].values
for preview in (X_simple_lreg, y_simple_lreg):
    print(preview[:5])

# Create train test dataset: 75% / 25% with a fixed seed.
(X_train_slreg, X_test_slreg,
 y_train_slreg, y_test_slreg) = train_test_split(
    X_simple_lreg, y_simple_lreg, test_size=0.25, random_state=4)
print('Train Dataset : ', X_train_slreg.shape, y_train_slreg.shape)
print('Test Dataset : ', X_test_slreg.shape, y_test_slreg.shape)

# fit() returns the estimator itself, so construction and fitting chain.
simple_lreg = LinearRegression().fit(X_train_slreg, y_train_slreg)
print('Intercept : ', simple_lreg.intercept_)
print('Slope : ', simple_lreg.coef_)
As we can see, the slope is -169.09, which means that the price of the vehicle is strongly affected by its age. The relationship is negative: the price decreases as the age increases.
from sklearn.metrics import r2_score

# Use the model to predict the test dataset.
y_simplelreg_pred_test = simple_lreg.predict(X_test_slreg)
# Use the model to predict the train dataset.
y_simplelreg_pred_train = simple_lreg.predict(X_train_slreg)

# Calculate the evaluation metrics of the model.
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# order matters.
r2_score_slreg_train = r2_score(y_train_slreg, y_simplelreg_pred_train)
r2_score_slreg_test = r2_score(y_test_slreg, y_simplelreg_pred_test)
# RMSE is the square root of the MSE (sqrt(mse**2) would just return the MSE).
rmse_slreg = np.sqrt(mean_squared_error(y_test_slreg, y_simplelreg_pred_test))
print('r2_ score for train dataset for simple linear reg : ', r2_score_slreg_train)
print('r2_ score for test dataset for simple linear reg : ', r2_score_slreg_test)
print('root mean squared error for simple linear reg : ', rmse_slreg)
Multiple linear regression (MLR), also known simply as multiple regression, is a statistical technique that uses several explanatory variables to predict the outcome of a response variable. The goal of multiple linear regression (MLR) is to model the linear relationship between the explanatory (independent) variables and response (dependent) variable.
Let us include some more independent variables to predict the price of the vehicle.
from sklearn.model_selection import train_test_split

# Every column except Price is a predictor; Price becomes a column vector.
X_multi_lreg = data_df.drop(columns='Price').values
y_multi_lreg = data_df["Price"].values.reshape(-1,1)

# Create train test dataset: 75% / 25% with a fixed seed.
(X_train_mlreg, X_test_mlreg,
 y_train_mlreg, y_test_mlreg) = train_test_split(
    X_multi_lreg, y_multi_lreg, test_size=0.25, random_state=4)
print('Train Dataset : ', X_train_mlreg.shape, y_train_mlreg.shape)
print('Test Dataset : ', X_test_mlreg.shape, y_test_mlreg.shape)

# fit() returns the estimator itself, so construction and fitting chain.
multi_lreg = LinearRegression().fit(X_train_mlreg, y_train_mlreg)
print('Intercept : ', multi_lreg.intercept_)
print('Slope : ', multi_lreg.coef_)
from sklearn.metrics import r2_score

# Use the model to predict the test dataset.
y_mlreg_pred_test = multi_lreg.predict(X_test_mlreg)
# Use the model to predict the train dataset.
y_mlreg_pred_train = multi_lreg.predict(X_train_mlreg)

# Have a look at the first predicted values.
print(y_mlreg_pred_test[0:5])
print(y_mlreg_pred_train[0:5])

# Calculate the evaluation metrics of the model.
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# order matters.
r2_score_mlreg_train = r2_score(y_train_mlreg, y_mlreg_pred_train)
r2_score_mlreg_test = r2_score(y_test_mlreg, y_mlreg_pred_test)
# RMSE is the square root of the MSE (sqrt(mse**2) would just return the MSE).
rmse_mlreg = np.sqrt(mean_squared_error(y_test_mlreg, y_mlreg_pred_test))
print('r2_ score for train dataset for multi linear reg : ', r2_score_mlreg_train)
print('r2_ score for test dataset for multi linear reg : ', r2_score_mlreg_test)
print('root mean squared error for multi linear reg : ', rmse_mlreg)
As we can see that using multiple independent variables we can improve the accuracy of the model.
Ridge regression is a way to create a parsimonious model when the number of predictor variables in a set exceeds the number of observations, or when a data set has multicollinearity (correlations between predictor variables).
Let us fit a ridge regression on the same set of independent variables.
from sklearn.model_selection import train_test_split

# Every column except Price is a predictor; Price becomes a column vector.
X_ridge_reg = data_df.drop(columns='Price').values
y_ridge_reg = data_df["Price"].values.reshape(-1,1)

# Create train test dataset: 75% / 25% with a fixed seed. This split is
# reused by the Lasso and ElasticNet cells below.
(X_train_ridge_reg, X_test_ridge_reg,
 y_train_ridge_reg, y_test_ridge_reg) = train_test_split(
    X_ridge_reg, y_ridge_reg, test_size=0.25, random_state=4)
print('Train Dataset : ', X_train_ridge_reg.shape, y_train_ridge_reg.shape)
print('Test Dataset : ', X_test_ridge_reg.shape, y_test_ridge_reg.shape)
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

## training the model
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On newer versions, drop the argument and scale the features in a
# Pipeline (e.g. with StandardScaler); alpha then needs re-tuning.
ridgeReg = Ridge(alpha=0.05, normalize=True)
ridgeReg.fit(X_train_ridge_reg, y_train_ridge_reg)

# Use the model to predict the test dataset.
y_ridgereg_pred_test = ridgeReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_ridgereg_pred_train = ridgeReg.predict(X_train_ridge_reg)

# Calculate the evaluation metrics of the model.
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric.
r2_score_ridgereg_train = r2_score(y_train_ridge_reg, y_ridgereg_pred_train)
r2_score_ridgereg_test = r2_score(y_test_ridge_reg, y_ridgereg_pred_test)
# RMSE is the square root of the MSE (sqrt(mse**2) would just return the MSE).
rmse_ridgereg = np.sqrt(mean_squared_error(y_test_ridge_reg, y_ridgereg_pred_test))
# Labels name the model actually evaluated (they used to say "multi linear reg").
print('r2_ score for train dataset for ridge reg : ', r2_score_ridgereg_train)
print('r2_ score for test dataset for ridge reg : ', r2_score_ridgereg_test)
print('root mean squared error for ridge reg : ', rmse_ridgereg)
Lasso regression is a type of linear regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of regression is well-suited for models showing high levels of muticollinearity or when you want to automate certain parts of model selection, like variable selection/parameter elimination.
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

## training the model (on the split created for the ridge regression)
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On newer versions, drop the argument and scale the features in a
# Pipeline (e.g. with StandardScaler); alpha then needs re-tuning.
lassoReg = Lasso(alpha=0.3, normalize=True)
lassoReg.fit(X_train_ridge_reg, y_train_ridge_reg)

# Use the model to predict the test dataset.
y_lassoreg_pred_test = lassoReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_lassoreg_pred_train = lassoReg.predict(X_train_ridge_reg)

# Calculate the evaluation metrics of the model.
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric.
r2_score_lassoreg_train = r2_score(y_train_ridge_reg, y_lassoreg_pred_train)
r2_score_lassoreg_test = r2_score(y_test_ridge_reg, y_lassoreg_pred_test)
# RMSE is the square root of the MSE (sqrt(mse**2) would just return the MSE).
rmse_lassoreg = np.sqrt(mean_squared_error(y_test_ridge_reg, y_lassoreg_pred_test))
# Labels name the model actually evaluated (they used to say "multi linear reg").
print('r2_ score for train dataset for lasso reg : ', r2_score_lassoreg_train)
print('r2_ score for test dataset for lasso reg : ', r2_score_lassoreg_test)
print('root mean squared error for lasso reg : ', rmse_lassoreg)
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

## training the model (on the split created for the ridge regression)
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On newer versions, drop the argument and scale the features in a
# Pipeline (e.g. with StandardScaler); alpha then needs re-tuning.
elasticNetReg = ElasticNet(alpha=1, l1_ratio=0.5, normalize=True)
elasticNetReg.fit(X_train_ridge_reg, y_train_ridge_reg)

# Use the model to predict the test dataset.
y_elasticNetReg_pred_test = elasticNetReg.predict(X_test_ridge_reg)
# Use the model to predict the train dataset.
y_elasticNetReg_pred_train = elasticNetReg.predict(X_train_ridge_reg)

# Calculate the evaluation metrics of the model.
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric.
r2_score_elasticNetReg_train = r2_score(y_train_ridge_reg, y_elasticNetReg_pred_train)
r2_score_elasticNetReg_test = r2_score(y_test_ridge_reg, y_elasticNetReg_pred_test)
# RMSE of the ElasticNet predictions (the Lasso predictions were used here by
# mistake); sqrt of the MSE, not sqrt(mse**2).
rmse_elasticNetReg = np.sqrt(mean_squared_error(y_test_ridge_reg, y_elasticNetReg_pred_test))
# Labels name the model actually evaluated (they used to say "multi linear reg").
print('r2_ score for train dataset for elastic net reg : ', r2_score_elasticNetReg_train)
print('r2_ score for test dataset for elastic net reg : ', r2_score_elasticNetReg_test)
print('root mean squared error for elastic net reg : ', rmse_elasticNetReg)
# One row per fitted model: name, R^2 on the train split, R^2 on the test
# split and the value stored in the matching rmse_* variable.
# ElasticNet was fitted above but missing from the summary, so it is added.
Models = [
    ('Simple Linear Regression', r2_score_slreg_train, r2_score_slreg_test, rmse_slreg),
    ('Multiplt Linear Regression', r2_score_mlreg_train, r2_score_mlreg_test, rmse_mlreg),
    ('Ridge Regression', r2_score_ridgereg_train, r2_score_ridgereg_test, rmse_ridgereg),
    ('Lasso Regression', r2_score_lassoreg_train, r2_score_lassoreg_test, rmse_lassoreg),
    ('ElasticNet Regression', r2_score_elasticNetReg_train, r2_score_elasticNetReg_test, rmse_elasticNetReg),
]
predict = pd.DataFrame(data = Models, columns = ['Models', 'r2_score Training', 'r2_score Testing', 'RMSE'])
predict
The performance of each algorithm is visualized as below:
# One bar chart per metric, stacked vertically.
f, axes = plt.subplots(3,1, figsize=(18,8))
for metric_ax, metric in zip(axes, ('r2_score Training', 'r2_score Testing', 'RMSE')):
    sns.barplot(x='Models', y=metric, data = predict, ax = metric_ax)
    # Label the axes that was just drawn (previously every label and limit
    # was applied to axes[0], leaving it titled 'RMSE').
    metric_ax.set_xlabel('Models')
    metric_ax.set_ylabel(metric)
    if metric != 'RMSE':
        # Only the R^2 charts use the 0-1 range; RMSE is in price units and
        # keeps matplotlib's automatic limits.
        metric_ax.set_ylim(0,1.0)
plt.tight_layout()
plt.show()
Neural networks are modeled loosely on the human brain: a neural net consists of thousands or even millions of simple processing nodes that are densely interconnected. Most of today’s neural nets are organized into layers of nodes, and they’re “feed-forward,” meaning that data moves through them in only one direction. An individual node might be connected to several nodes in the layer beneath it, from which it receives data, and several nodes in the layer above it, to which it sends data.
To each of its incoming connections, a node will assign a number known as a “weight.” When the network is active, the node receives a different data item — a different number — over each of its connections and multiplies it by the associated weight. It then adds the resulting products together, yielding a single number. If that number is below a threshold value, the node passes no data to the next layer. If the number exceeds the threshold value, the node “fires,” which in today’s neural nets generally means sending the number — the sum of the weighted inputs — along all its outgoing connections.
When a neural net is being trained, all of its weights and thresholds are initially set to random values. Training data is fed to the bottom layer — the input layer — and it passes through the succeeding layers, getting multiplied and added together in complex ways, until it finally arrives, radically transformed, at the output layer. During training, the weights and thresholds are continually adjusted until training data with the same labels consistently yield similar outputs.
A multilayer perceptron (MLP) is a feedforward artificial neural network that generates a set of outputs from a set of inputs. An MLP is characterized by several layers of input nodes connected as a directed graph between the input and output layers. MLP uses backpropagation for training the network.
Is a Multilayer Perceptron (MLP) the same thing as a Deep Neural Network (DNN)?
An MLP is a subset of DNNs: while a DNN can have loops, an MLP is always feed-forward, i.e. a Multilayer Perceptron is a finite acyclic graph.
NNs have become the infrastructure of deep learning, voice recognition and almost every successful AI agent and model in the modern world.
We have summarized the importance of neural networks and their applications; now we work through an example to shed some light on the subject and make it more tangible!
Below we have the steps to learn the Fashion MNIST database with neural networks and a bunch of built-in libraries.
First we have to import needed libraries:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
We import the Fashion MNIST dataset, a dataset which contains 70,000 grayscale images in 10 categories. The images show individual articles of clothing at low resolution (28 by 28 pixels).
Here, 60,000 images are used to train the network and 10,000 images to evaluate how accurately the network learned to classify images. We can access the Fashion MNIST directly from TensorFlow.
# Fashion-MNIST ships with Keras; load_data() is unpacked into a
# (images, labels) pair for training and another one for testing.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
The labels are an array of integers, ranging from 0 to 9. These correspond to the class of clothing the image represents:
# Human-readable name for each integer label; the list index is the label.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
If you inspect the first image in the training set, you will see that the pixel values fall in the range of 0 to 255, we scale these values to a range of 0 to 1 before feeding them to the neural network model. To do so, divide the values by 255.
# Divide both image arrays by 255.0 so the pixel values are rescaled before
# they are fed to the network.
train_images, test_images = (
    pixels / 255.0 for pixels in (train_images, test_images)
)
To check that the scaling worked correctly, and to get a feel for the pictures, we display the first 25 training images together with their class names.
# 5x5 grid with the first 25 training images, each captioned with its class.
plt.figure(figsize=(10,10))
first_samples = zip(train_images[:25], train_labels[:25])
for cell, (picture, label) in enumerate(first_samples, start=1):
    plt.subplot(5,5,cell)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(picture, cmap=plt.cm.binary)
    plt.xlabel(class_names[label])
plt.show()
Now we have to create our neural network. The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Hopefully, these representations are meaningful for the problem at hand.
Most of deep learning consists of chaining together simple layers. Most layers, such as tf.keras.layers.Dense, have parameters that are learned during training.
The first layer in this network, tf.keras.layers.Flatten, transforms the format of the images from a two-dimensional array (of 28 by 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). In other words, this layer flattens the data: think of it as unstacking the rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data.
After the pixels are flattened, the network consists of a sequence of two tf.keras.layers.Dense layers. These are densely connected, or fully connected, neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes.
# Build the network layer by layer: flatten the 28x28 image, one hidden
# ReLU layer with 128 units, and a 10-unit output layer that emits logits.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(10))

#compiling the model
# from_logits=True because the last Dense layer has no softmax activation.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

model.fit(train_images, train_labels, epochs=10)
As the model trains, the loss and accuracy metrics are displayed. This model reaches an accuracy of about 0.91 (or 91%) on the training data.
Next, we have to test the trained model against the test data.
# Score the trained model on the test split; the result is unpacked into the
# loss and the accuracy metric the model was compiled with.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)
It turns out that the accuracy on the test dataset is a little less than the accuracy on the training dataset. This gap between training accuracy and test accuracy represents overfitting. Overfitting happens when a machine learning model performs worse on new, previously unseen inputs than it does on the training data. An overfitted model "memorizes" the noise and details in the training dataset to a point where it negatively impacts the performance of the model on the new data.
With the model trained, you can use it to make predictions about some images.
Let's plot several images with their predictions. Note that the model can be wrong even when very confident.
# The trained model outputs logits (it was compiled with from_logits=True),
# so a Softmax layer is appended to turn them into per-class probabilities.
probability_model = tf.keras.Sequential([model,
tf.keras.layers.Softmax()])
# Predicted probabilities for every test image.
predictions = probability_model.predict(test_images)
def plot_image(i, predictions_array, true_label, img):
    """Show image ``i`` captioned with the predicted class and confidence.

    Args:
        i: index into ``true_label`` and ``img``.
        predictions_array: per-class scores for this image.
        true_label: sequence of integer labels.
        img: sequence of images.

    The caption reads "<predicted> <confidence>% (<actual>)" and is blue when
    the prediction matches the true label, red otherwise.
    """
    actual, picture = true_label[i], img[i]

    # Bare image: no grid and no ticks.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(picture, cmap=plt.cm.binary)

    guess = np.argmax(predictions_array)
    confidence = 100*np.max(predictions_array)
    caption = "{} {:2.0f}% ({})".format(class_names[guess],
                                         confidence,
                                         class_names[actual])
    plt.xlabel(caption, color='blue' if guess == actual else 'red')
def plot_value_array(i, predictions_array, true_label):
    """Draw the ten class scores of one prediction as a bar chart.

    Args:
        i: index of the sample inside ``true_label``.
        predictions_array: the ten per-class scores for that sample.
        true_label: sequence of integer labels.

    The predicted class bar is coloured red first and the true class bar blue
    afterwards, so a correct prediction ends up blue.
    """
    actual = true_label[i]
    class_ids = range(10)

    plt.grid(False)
    plt.xticks(class_ids)
    plt.yticks([])
    bars = plt.bar(class_ids, predictions_array, color="#777777")
    plt.ylim([0, 1])

    bars[np.argmax(predictions_array)].set_color('red')
    bars[actual].set_color('blue')
# Plot the first X test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
# Every sample takes two neighbouring cells: the image and its score chart.
grid_cols = 2*num_cols
plt.figure(figsize=(2*grid_cols, 2*num_rows))
for i in range(num_images):
    scores = predictions[i]
    image_cell = 2*i + 1
    plt.subplot(num_rows, grid_cols, image_cell)
    plot_image(i, scores, test_labels, test_images)
    plt.subplot(num_rows, grid_cols, image_cell + 1)
    plot_value_array(i, scores, test_labels)
plt.tight_layout()
plt.show()
We also can use the model to classify a single photo:
# Classify a single test image.
img_index = 20
img = test_images[img_index]
# Keras models predict on batches, so wrap the image in a batch of one.
img = (np.expand_dims(img,0))
predictions_single = probability_model.predict(img)
# Use the same index for the true label as for the image (index 1 was used
# before, so the chart highlighted the label of a different sample).
plot_value_array(img_index, predictions_single[0], test_labels)
_ = plt.xticks(range(10), class_names, rotation=45)