\documentclass{article}

\setlength{\oddsidemargin}{-0.5in}
\setlength{\evensidemargin}{-0.5in}
\setlength{\topmargin}{0.0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\textwidth}{7.0in}
\setlength{\textheight}{9.5in}
\setlength{\parindent}{0in}
\setlength{\parskip}{0.05in}
\setlength{\columnseprule}{0.3pt}
\usepackage{fancyvrb}
\usepackage{relsize}
\usepackage{hyperref}

\usepackage{listings}

\begin{document}

Name: \_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

Directions: {\bf Work only on this sheet} (on both sides, if needed); do
not turn in any supplementary sheets of paper. There is actually plenty
of room for your answers, as long as you organize yourself BEFORE
starting writing.

{\bf 1.} (40) Write an R function that computes two-dimensional Discrete
Fourier Transforms (DFTs) in parallel.  You must use the approach that
makes use of separability, described on p.167, and use {\bf snow} as
your vehicle for parallelization.  Your function will call R's
one-dimensional DFT function {\tt fft()}; the call {\bf fft(v)} on a
vector {\bf v} returns the DFT of {\bf v}.  (That function is also
capable of two-dimensional DFTs, but you will not use that here, in
order to parallelize the operation.)

The form of your function's call will be {\bf fft2(m)}, where {\bf m} is
a matrix.  The return value will be a matrix of the same size.  Write
full code, with clear comments, and {\bf WRITE LEGIBLY}.

{\bf 2.} (30) Below is OpenMP code to compute prefix sums in parallel,
using the approach outlined at the bottom of p.130 and top of p.131.

Here is a sample call:

\begin{Verbatim}[fontsize=\relsize{-2}]

int main()
{  int i,u[9] = {5,12,13,5,4,3,8,6,1}, z[3];
   parprfsum(u,9,z);
   for (i = 0; i < 9; i++) printf("%d\n",u[i]);
}
\end{Verbatim}

The result will be (5,17,30,35,39,42,50,56,57).

Fill in the blanks in the code:

\begin{Verbatim}[fontsize=\relsize{-2}]
#include <omp.h>

// calculates prefix sums sequentially on u, where u is an 
// m-element array

void seqprfsum(int *u,int m)
{  int i,s=u[0];
   for (i = 1; i < m; i++) {
      u[i] += s;
      s = u[i];
   }
}

// OMP example, calculating prefix sums in parallel on the 
// n-element array x, in-place; for simplicity, assume that n is 
// divisible by the number of threads; z is for intermediate 
// storage, an array with length equal to the number of threads; x 
// and z point to global arrays 
void parprfsum(int *x, int n, int *z)
{
   #pragma omp parallel
   {  int i,j,me = omp_get_thread_num(),
                                         // one blank line
          chunksize = n /                // blank
          start =                        // blank
      seqprfsum(&x[start],chunksize);
      #pragma omp                        // blank
      #pragma omp                        // blank 
      { 
      for (i = 0; i < nth-1; i++) 
         z[i] =                          // blank
      seqprfsum(z,nth-1);
      }
      if (                   ) {         // blank
         for (j = start; j < start + chunksize; j++) {
            x[j]                         // blank
         }
      }
   }
}
\end{Verbatim}

{\bf 3.} (30) Below is CUDA code to solve a linear system of equations
via Gaussian elimination.  It uses the approach of p.143, except that it
reduces to the form $(I | x)$, where {\bf I} is the identity matrix and 
{\bf x} is the solution to the system.  The difference from p.143 is
that line 3 in the pseudocode is now

\begin{Verbatim}[fontsize=\relsize{-2}]
for r = 0 to n-1, r != i  
\end{Verbatim}

There is no pivoting, i.e. no swapping of rows if 0s or near-0 values
are encountered.

Fill in the blanks:

\begin{Verbatim}[fontsize=\relsize{-2}]
// linear index for matrix element at row i, column j, in an m-column
// matrix
__device__ int onedim(int i,int j,int m) {return i*m+j;}

// replace u by c* u; vector of length m
__device__ void cvec(float *u, int m, float c)
{  for (int i = 0; i < m; i++) u[i] = c * u[i];  }

// multiply the vector u of length m by the constant c (not changing u)
// and add the result to v
__device__ void vplscu(float *u, float *v, int m, float c)
{  for (int i = 0; i < m; i++) v[i] += c * u[i];  }

// copy the vector u of length m to v
__device__ void cpuv(float *u, float *v, int m)
{  for (int i = 0; i < m; i++) v[i] = u[i];  }

// solve matrix equation Ax = b; straight Gaussian elimination, no
// pivoting etc.; the matrix ab is (A|b), n rows; ab is destroyed, with
// x placed in the last column; one block, with thread i handling row i
__global__ void gauss(float *ab, int n)
{  int i,n1=n+1,abii,abme;
   extern __shared__ float iirow[];
   int me = threadIdx.x;
   for (i = 0; i < n; i++) {  
      if (                      ) {             // blank
         abii =                                 // blank
         cvec(&ab[abii],n1-i,1/ab[abii]);
         cpuv(                   );             // blank
      }
                                                // one blank line
      if (                     ) {              // blank
         abme = onedim(me,i,n1);
         vplscu(iirow,                          // blank
      }
      __syncthreads();
   }
}
\end{Verbatim}

{\bf Solutions:}

{\bf 1.} If you have discovered how {\bf parApply()} works on
vector-valued functions---placing the result of each row {\it or} column
of the input into a {\it column} of the output, you can write the code
this way:

\begin{Verbatim}[fontsize=\relsize{-2}]
fft2 <- function(cls,m) {
   tmp <- parApply(cls,m,1,fft)
   return(parApply(cls,tmp,1,fft))
}
\end{Verbatim}

The code using only fundamental ops would run along the following lines:

\begin{Verbatim}[fontsize=\relsize{-2}]
l <- list()
for each row r in m
   add r to l
call clusterApply() on l with fft(), result mm
l <- list()
for each column c in m
   add c to l
call clusterApply() on l with fft(), return result 
\end{Verbatim}

{\bf 2.}

\begin{Verbatim}[fontsize=\relsize{-2}]
#include <omp.h>

// calculates prefix sums sequentially in-place on u, where u is an
// m-element array
void seqprfsum(int *u,int m)
{  int i,s=u[0];
   for (i = 1; i < m; i++) {
      u[i] += s;
      s = u[i];
   }
}

// OMP example, calculating prefix sums in parallel on the n-element
// array x, in-place; for simplicity, assume that n is divisible by the
// number of threads; z is for intermediate storage, an array with length
// equal to the number of threads; x and z point to global arrays
void parprfsum(int *x, int n, int *z)
{
   #pragma omp parallel
   {  int i,j,me = omp_get_thread_num(),
          nth = omp_get_num_threads(),
          chunksize = n / nth,
          start = me * chunksize;
      seqprfsum(&x[start],chunksize);
      #pragma omp barrier 
      #pragma omp single 
      { 
      for (i = 0; i < nth-1; i++) 
         z[i] = x[(i+1)*chunksize - 1];
      seqprfsum(z,nth-1);
      }
      if (me > 0) {
         for (j = start; j < start + chunksize; j++) {
            x[j] += z[me - 1]; 
         }
      }
   }
}
\end{Verbatim}

{\bf 3.}

\begin{Verbatim}[fontsize=\relsize{-2}]
#include <stdio.h>

// linear index for matrix element at row i, column j, in an m-column
// matrix
__device__ int onedim(int i,int j,int m) {return i*m+j;}

// replace u by c* u; vector of length m
__device__ void cvec(float *u, int m, float c)
{  for (int i = 0; i < m; i++) u[i] = c * u[i];  }

// multiply the vector u of length m by the constant c (not changing u)
// and add the result to v
__device__ void vplscu(float *u, float *v, int m, float c)
{  for (int i = 0; i < m; i++) v[i] += c * u[i];  }

// copy the vector u of length m to v
__device__ void cpuv(float *u, float *v, int m)
{  for (int i = 0; i < m; i++) v[i] = u[i];  }

// solve matrix equation Ax = b; straight Gaussian elimination, no
// pivoting etc.; the matrix ab is (A|b), n rows; ab is destroyed, with
// x placed in the last column; one block, with thread i handling row i
__global__ void gauss(float *ab, int n)
{  int i,n1=n+1,abii,abme;
   extern __shared__ float iirow[];
   int me = threadIdx.x;
   for (i = 0; i < n; i++) {  // seq through the diagonal for pivots
      if (i == me) {
         abii = onedim(i,i,n1);
         cvec(&ab[abii],n1-i,1/ab[abii]);
         cpuv(&ab[abii],iirow,n1-i);
      }
      __syncthreads();
      if (i != me) {
         abme = onedim(me,i,n1);
         vplscu(iirow,&ab[abme],n1-i,-ab[abme]);
      }
      __syncthreads();
   }
}
\end{Verbatim}

\end{document}