Q1: divergence Q2: atomicAdd() Q3: Lines 11 and 12 must change, to int totth = gridDim.x * blockDim.x * blockDim.y, me = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; Here we are thinking of the threads as being arranged in row-major order. Q4: transgraph.par <- function(cls,mat) { # to retain original row numbers after splitting matrix rownames(mat) <- as.character(1:nrow(mat)) # distribute matrix chunks to workers rowgrps <- splitIndices(nrow(mat),length(cls)) # <<- assigns to global clusterApply(cls,rowgrps, function(onegrp) mymat <<- mat[onegrp,]) # do the work clusterExport(cls,'transgraph.ser') tmp <- clusterEvalQ(cls,transgraph.ser(mymat)) Reduce(rbind,tmp) } transgraph.ser <- function(adj) { outmat <- NULL for (i in 1:nrow(adj)) { where1s <- which(adj[i,] == 1) origrownum <- as.numeric(rownames(adj)[i]) newrows <- cbind(origrownum,where1s) outmat <- rbind(outmat,newrows) } outmat } test <- function() { require(parallel) a <- rbind(c(1,0,1,1), c(1,0,0,1), c(0,0,0,1), c(0,1,1,0)) cls <- makeCluster(2) transgraph.par(cls,a) } test()