Q1:  divergence

Q2:  atomicAdd()

Q3:  Lines 11 and 12 must change, to

  int totth = gridDim.x * blockDim.x * blockDim.y,  
       me = blockIdx.x * blockDim.x * blockDim.y +
               threadIdx.y * blockDim.x +
               threadIdx.x;
Here we are thinking of the threads as being arranged in row-major
order.

Q4:

transgraph.par <- function(cls,mat) {
   # to retain original row numbers after splitting matrix
   rownames(mat) <- as.character(1:nrow(mat))
   # distribute matrix chunks to workers
   rowgrps <- splitIndices(nrow(mat),length(cls))
   # <<- assigns to global
   clusterApply(cls,rowgrps,
      function(onegrp) mymat <<- mat[onegrp,])
   # do the work
   clusterExport(cls,'transgraph.ser')
   tmp <- clusterEvalQ(cls,transgraph.ser(mymat))
   Reduce(rbind,tmp)
}

transgraph.ser <- function(adj) {
   outmat <- NULL
   for (i in 1:nrow(adj))  {
      where1s <- which(adj[i,] == 1)
      origrownum <- as.numeric(rownames(adj)[i])
      newrows <- cbind(origrownum,where1s)
      outmat <- rbind(outmat,newrows)
   }
   outmat
}

test <- function() {
   require(parallel)
   a <- rbind(c(1,0,1,1), c(1,0,0,1), c(0,0,0,1), c(0,1,1,0)) 
   cls <- makeCluster(2) 
   transgraph.par(cls,a) 
}

test()