\documentclass[12pt,a4paper]{article}
\usepackage{graphicx}
\graphicspath{{eps/}{pdf/}}
\pagestyle{empty}
\oddsidemargin 2.1mm
\textwidth 155mm
\topmargin -15mm
\textheight 240mm
\def\net{\mathop{\rm net}\nolimits}
\begin{document}
%-----------------------------------------------------------------------
\noindent
{\bf Artificial Neural Networks and Deep Learning}
\hfill Summer 2018 \\
Christian Borgelt and Christoph Doell \hfill from 2018.06.19
%-----------------------------------------------------------------------
\vskip3ex
\centerline{\bf Exercise Sheet 7}
%-----------------------------------------------------------------------
\subsubsection*{Exercise 27 \quad\rm
Activation Function and Classification}
\begin{enumerate}\itemsep0pt
\item[a)\hfill]
Suppose in a multi-layer perceptron the hyperbolic tangent
({\em tangens hyperbolicus\/}) is used as the activation function.
Derive the factor in the backpropagation formula that results
from this activation function! Try to find a similarly simple
expression as for the logistic function (cf.\ the derivation
in the lecture)!
\item[b)\hfill]
Consider a three class problem, that is, a data set, in which
each sample case is assigned to one class (and one class only).
The Iris data discussed in the lecture may serve as an example.
In order to solve this problem with a multi-layer perceptron,
i.e., to create/train a neural network classifier, one has to
choose the number of output neurons and the desired output
values for these neurons. What number of output neurons would
you choose and what desired output values would you assign
if the activation function is (1) the logistic function or
(2) the hyperbolic tangent ({\em tangens hyperbolicus\/})?
Is it a good idea to derive the desired values from the
saturation values of these functions? Justify your answer!
\end{enumerate}
%-----------------------------------------------------------------------
\subsubsection*{Exercise 28 \quad\rm Convolutional Neural Networks}
In image processing grayscale images are often seen as a function of two variables
$f(x,y)$ ($x$ and $y$: coordinates of a pixel, function value: gray
value of the pixel) and represented as a matrix. Consider the following
image~$\mathbf{A}$:
\[ \mathbf{A} = \left[\begin{array}{cccccc}
0 & 0 & 0 & 0 & 0 & 0 \\
0 & 10 & 10 & 10 & 10 & 0 \\
0 & 10 & 10 & 10 & 10 & 0 \\
0 & 10 & 10 & 0 & 0 & 0 \\
0 & 10 & 10 & 0 & 0 & 0 \\
0 & 0 & 0 & 0 & 0 & 0
\end{array}\right] \]
Convolutional Neural Networks are able to efficiently process image data
independent of the position and rotation of objects that are visible in
the images. For this, so-called kernel functions are used, which are
{\em convolved\/} with the image matrix. The computed
{\em convolved features\/} are then used to recognize objects.
A possible convolved feature is the existence of an edge. If one assumes
that at an edge the brightness changes significantly, edges may be
detected as maxima of the first derivative of the image function~$f$.
This is the idea underlying the so-called Sobel operator, which
consists of two sub-operators or kernel functions, namely
\[ \mathbf{S}_x = \left[\begin{array}{rrr}
\phantom{-}1 & \phantom{-}0 & -1 \\
2 & 0 & -2 \\
1 & 0 & -1
\end{array}\right]
\qquad\mbox{and}\qquad
\mathbf{S}_y = \left[\begin{array}{rrr}
1 & 2 & 1 \\
0 & 0 & 0 \\
-1 & -2 & -1
\end{array}\right]. \]
\newpage
\begin{itemize}
\item[a)\hfill]
Compute the convolved features~$\mathbf{G}_x$ and
$\mathbf{G}_y$ by sectionwise multiplication (convolution) of
the operators~$\mathbf{S}_x$ and $\mathbf{S}_y$ with the
matrix~$\mathbf{A}$, i.e.\
$\mathbf{G}_x = \mathbf{S}_x * \mathbf{A}$ and
$\mathbf{G}_y = \mathbf{S}_y * \mathbf{A}$!
(A spreadsheet program may be your friend.)
\item[b)\hfill]
From the resulting direction-dependent matrices~$\mathbf{G}_x$
and~$\mathbf{G}_y$ a direc\-tion-in\-de\-pen\-dent
matrix~$\mathbf{G} = (g_{ij})_{i,j=1,\ldots,6}$ is to be computed.
For this the entries $g_{x,ij}$ and $g_{y,ij}$ of the
direction-dependent matrices $\mathbf{G}_x$ and~$\mathbf{G}_y$,
respectively, are squared, then summed, and finally the square
root of the result is computed, that is,
$g_{ij} = \sqrt{g_{x,ij}^2 + g_{y,ij}^2}$.
Compute the matrix~$\mathbf{G}$ with the help of the results
obtained in part~a) and describe the final result!
\end{itemize}
%-----------------------------------------------------------------------
\subsubsection*{Exercise 29 \quad\rm Deep Learning: $n$-bit Parity}
In the lecture it was shown how the $n$-bit parity function can
be computed by a chain consisting of one {\em biimplication\/} network
and $n-2$ {\em exclusive or\/} networks. Show how the $n$-bit parity
function may also be computed by a binary tree of sub-networks, each
of which computes the biimplication! How many layers does the resulting
network have (as a function of~$n$)? How many neurons does it contain
in total (as a function of~$n$)?
%-----------------------------------------------------------------------
\subsubsection*{Exercise 30 \quad\rm Deep Learning: Dropout}
\begin{enumerate}\itemsep0pt
\item[a)\hfill]
Consider a 12-layer perceptron with 10~input neurons,
10~neurons in each hidden layer and 1 output neuron.
How many parameters in total have to be trained in this network?
\item[b)\hfill]
The network is to be trained with the dropout approach.
How many parameters have to be considered per training step
if due to the dropout (on average) 5~neurons are deactivated
per layer?
\item[c)\hfill]
How does the (expected) number of parameters to consider depend
on the dropout rate, that is, the fraction of deactivated neurons
per layer?
\end{enumerate}
%-----------------------------------------------------------------------
\end{document}