\documentclass[twocolumn]{article}

\setlength{\oddsidemargin}{-0.5in}
\setlength{\evensidemargin}{-0.5in}
\setlength{\topmargin}{0.0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\textwidth}{7.0in}
\setlength{\textheight}{9.5in}
\setlength{\parindent}{0in}
\setlength{\parskip}{0.05in}
\setlength{\columnseprule}{0.3pt}
\usepackage{fancyvrb}
\usepackage{relsize}
\usepackage{hyperref}

\begin{document}

Name: \_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

Directions: {\bf \Large Work only on this sheet} (on both sides, if
needed); do not turn in any supplementary sheets of paper. There is
actually plenty of room for your answers, as long as you organize
yourself BEFORE starting writing.

{\bf \Large Unless otherwise stated, give numerical answers as
expressions, e.g. $\frac{2}{3} \times 6 - 1.8$.  Do NOT use
calculators.}

{\bf 1.} (35) The code below implements the Jacobi algoritm in OpenMP.
Fill in the blanks, and add any lines necessary.  For the latter action,
write something like, ``Place the following code between lines 8 and
9.'' Do NOT delete or change lines.

\begin{Verbatim}[fontsize=\relsize{-2},numbers=left]
#include <omp.h>

// partitions s..e into nc chunks, placing the 
// ith in first and last 
void chunker(int s, int e, int nc, int i, 
   int *first, int *last)
{  int chunksize = (e-s+1) / nc;
   *first = s + i * chunksize;
   if (i < nc-1) *last = *first + chunksize - 1;
   else *last = e;
}

// returns the "dot product" of vectors u and v
float innerprod(float *u, float *v, int n)
{  float sum = 0.0; int i;
   for (i = 0; i < n; i++)
      sum += u[i] * v[i];
   return sum;
} 

// solves AX = Y, A nxn; stop iteration when 
// total change is < n*eps
void jacobi(float *a, float *x, float *y, int n, float eps)
{
   float *oldx = malloc(n*sizeof(float));
   float se;
   #pragma omp parallel
   {  int i;
      int thn = omp_get_thread_num();
      int nth = omp_get_num_threads();
      int first,last;
      chunker(0,n-1,nth,thn,&first,&last);
      for (i = first; i <= last; i++) oldx[i] = x[i] = 1.0;
      float tmp;
      while (1) {
         for (i = first; i <= last; i++) {
            tmp = innerprod(_____________________________________)
            tmp -= a[n*i+i] * oldx[i];
            _____________________________________________________;
         }
         for (i = first; i <= last; i++) 
            se += abs(x[i]-oldx[i]);
         ___________________________________________________;
         for (i = first; i <= last; i++) 
            oldx[i] = x[i];
      }
   }
}
\end{Verbatim}

{\bf 2.} (35) The code below implements the odd/even transposition sort in
CUDA.  Fill in the blanks, and add any lines necessary.  For the latter
action, write something like, ``Place the following code between lines 8
and 9.'' Do NOT delete or change lines.

\begin{Verbatim}[fontsize=\relsize{-2},numbers=left]
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// compare and swap, thread number "me"; copies from f to t, 
// swapping f[i] and f[j] if the higher-index value is smaller; 
// it is required that i < j
________________cas(int *f,int *t,int i,int j, int n, int me)
{  
   if (i < 0 || j >= n) return;
   if (me == i) {
      if (f[i] > f[j]) t[me] = f[j];
      else t[me] = f[i];
   } else {  // me == j
      if (f[i] > f[j]) t[me] = f[i];
      else t[me] = f[j];
   }
}

// does one iteration of the sort
__global__ void oekern(int *da, int *daaux, int n, int iter)
{  int bix = ______________________________________________;
   if (iter % 2) { 
      if (bix % 2) cas(da,daaux,bix-1,bix,n,bix);
      else cas(da,daaux,bix,bix+1,n,bix);
   } else {
      if (bix % 2) cas(da,daaux,bix,bix+1,n,bix);
      else cas(da,daaux,bix-1,bix,n,bix);
   }
}

// sorts the array ha, length n, using odd/even transp. sort; 
// kept simple for illustration, no optimization 
void oddeven(int *ha, int n)
{
   int *da; 
   int dasize = n * sizeof(int);
   cudaMalloc((void **)&da,dasize);
   cudaMemcpy(da,ha,dasize,cudaMemcpyHostToDevice);
   // the array daaux will serve as "scratch space," a copy of da; will
   // be using cudaMemcpy() with cudaMemcpyDeviceToDevice argument to 
   // copy back
   // we need daaux because the hardware lacks _________________________ 
   // __________________________________________________________________
   int *daaux;  
   cudaMalloc((void **)&daaux,dasize);
   cudaMemcpy(daaux,ha,dasize,cudaMemcpyHostToDevice);
   dim3 dimGrid(n,1);
   dim3 dimBlock(1,1,1);
   for (int iter = 1; iter <= n; iter++) {
   }
   cudaMemcpy(ha,da,dasize,cudaMemcpyDeviceToHost);
}
\end{Verbatim}

{\bf 3.} A {\bf wavefront} operation on an nxn matrix A works on
diagonals.  Here we will define the i$^{th}$ ``wave'' to consist of
$a_{i0}, a_{i-1,1},...,a_{0,i}$.  (Note that these diagonals are
perpendicular to the usual ones.)  Suppose we are developing a highly
parallel implementation, say in CUDA.  Say we set up the calculation
for wave i so that a different thread handles each element in the wave.
So, assigning from ``southwest to northeast,'' thread 0 would handle
$a_{i0}$, thread 1 would handle $a_{i-1,1}$, and so on.

\begin{itemize}

\item [(a)] (15) Fill in the blanks:  An obvious problem is that some
threads have less work to do than others.  In standard terminology, we
say that this is a {\it \_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
problem}.  Thread i will only be busy a fraction
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_ of the time, i =
0,1,...,n-1.

\item [(b)] (15) Suppose n = 7 and shared memory has 8 banks.  For which
values of i will the i$^{th}$ wave generate no bank conflicts?

\end{itemize}

{\bf Solutions:}

{\bf 1.} 

\begin{Verbatim}[fontsize=\relsize{-2}]
#include <omp.h>

// partitions s..e into nc chunks, placing the ith in first and last (i
// = 0,...,nc-1)
void chunker(int s, int e, int nc, int i, int *first, int *last)
{  int chunksize = (e-s+1) / nc;
   *first = s + i * chunksize;
   if (i < nc-1) *last = *first + chunksize - 1;
   else *last = e;
}

// returns the "dot product" of vectors u and v
float innerprod(float *u, float *v, int n)
{  float sum = 0.0; int i;
   for (i = 0; i < n; i++)
      sum += u[i] * v[i];
   return sum;
} 

// solves AX = Y, A nxn; stop iteration when total change is < n*eps
void jacobi(float *a, float *x, float *y, int n, float eps)
{
   float *oldx = malloc(n*sizeof(float));
   float se;
   #pragma omp parallel
   {  int i;
      int thn = omp_get_thread_num();
      int nth = omp_get_num_threads();
      int first,last;
      chunker(0,n-1,nth,thn,&first,&last);
      for (i = first; i <= last; i++) oldx[i] = x[i] = 1.0;
      float tmp;
      while (1) {
         for (i = first; i <= last; i++) {
            tmp = innerprod(&a[n*i],oldx,n);
            tmp -= a[n*i+i] * oldx[i];
            x[i] = (y[i] - tmp) / a[n*i+i];
         }
         #pragma omp barrier
         #pragma omp for reduction(+:se)
         for (i = first; i <= last; i++) 
            se += abs(x[i]-oldx[i]);
         #pragma omp barrier
         if (se < n*eps) break;
         for (i = first; i <= last; i++) 
            oldx[i] = x[i];
      }
   }
}
\end{Verbatim}

{\bf 2.}  Some students pointed out that swapping {\bf da} and {\bf
daaux} was better than copying.  This is actually what I intended in the
first place, but during debugging I ``temporarily'' change it to a copy,
which was easier.  I then forgot to change it back later.

\begin{Verbatim}[fontsize=\relsize{-2}]
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// compare and swap; copies from the f to t, swapping f[i] and
// f[j] if the higher-index value is smaller; it is required that i < j
__device__ void cas(int *f,int *t,int i,int j, int n, int me)
{  
   if (i < 0 || j >= n) return;
   if (me == i) {
      if (f[i] > f[j]) t[me] = f[j];
      else t[me] = f[i];
   } else {  // me == j
      if (f[i] > f[j]) t[me] = f[i];
      else t[me] = f[j];
   }
}

// does one iteration of the sort
__global__ void oekern(int *da, int *daaux, int n, int iter)
{  int bix = blockIdx.x;  // block number within grid
   if (iter % 2) { 
      if (bix % 2) cas(da,daaux,bix-1,bix,n,bix);
      else cas(da,daaux,bix,bix+1,n,bix);
   } else {
      if (bix % 2) cas(da,daaux,bix,bix+1,n,bix);
      else cas(da,daaux,bix-1,bix,n,bix);
   }
}

// sorts the array ha, length n, using odd/even transp. sort; 
// kept simple for illustration, no optimization 
void oddeven(int *ha, int n)
{
   int *da; 
   int dasize = n * sizeof(int);
   cudaMalloc((void **)&da,dasize);
   cudaMemcpy(da,ha,dasize,cudaMemcpyHostToDevice);
   // the array daaux will serve as "scratch space," a copy of da; will
   // be using cudaMemcpy() with cudaMemcpyDeviceToDevice argument to 
   // copy back
   // we need daaux because the hardware lacks facilities for
   // synchronization between blocks
   int *daaux;  
   cudaMalloc((void **)&daaux,dasize);
   cudaMemcpy(daaux,ha,dasize,cudaMemcpyHostToDevice);
   dim3 dimGrid(n,1);
   dim3 dimBlock(1,1,1);
   for (int iter = 1; iter <= n; iter++) {
      oekern<<<dimGrid,dimBlock>>>(da,daaux,n,iter);
      cudaThreadSynchronize();
      cudaMemcpy(da,daaux,dasize,cudaMemcpyDeviceToDevice);
   }
   cudaMemcpy(ha,da,dasize,cudaMemcpyDeviceToHost);
}
\end{Verbatim}

{\bf 3.} This problem was slightly misspecified, which made it a little
easier.  There really should be 2n-1 waves, not just n.

\begin{itemize}

\item [(a)] This is a {\bf load balancing} problem.  Thread i will be
busy only a fraction (n-i)/n of the time.

\item [(b)] Without loss of generality, assume that the array starts at
word address 0.  Note that consecutive elements within a wave are 6
words apart.  Then we have the following table:

\begin{tabular}{|c|c|c|}
\hline
wave & addresses & banks \\ \hline 
\hline
0 & 0 & 0 \\ \hline
1 & 1,7 & 1,7 \\ \hline
2 & 2,8,14 & 2,0,6 \\ \hline
3 & 3,9,15,21 & 3,1,7,5 \\ \hline
4 & 4,10,16,22,28 & 4,2,0,6,4 \\ \hline
5 & 5,11,17,23,29,35 & 5,3,1,7,5,3 \\ \hline
6 & 6,12,18,24,30,36,42 & 6,4,2,0,6,4,2 \\ \hline
\end{tabular}

Only threds 0, 1, 2 and 3 avoid bank conflicts.

\end{itemize}

\end{document}