\contentsline {chapter}{\numberline {1}Introduction to Parallel Processing}{1}{chapter.1} \contentsline {section}{\numberline {1.1}Overview: Why Use Parallel Systems?}{1}{section.1.1} \contentsline {subsection}{\numberline {1.1.1}Execution Speed}{1}{subsection.1.1.1} \contentsline {subsection}{\numberline {1.1.2}Memory}{2}{subsection.1.1.2} \contentsline {subsection}{\numberline {1.1.3}Distributed Processing}{2}{subsection.1.1.3} \contentsline {subsection}{\numberline {1.1.4}Our Focus Here}{2}{subsection.1.1.4} \contentsline {section}{\numberline {1.2}Parallel Processing Hardware}{3}{section.1.2} \contentsline {subsection}{\numberline {1.2.1}Shared-Memory Systems}{3}{subsection.1.2.1} \contentsline {subsubsection}{\numberline {1.2.1.1}Basic Architecture}{3}{subsubsection.1.2.1.1} \contentsline {subsubsection}{\numberline {1.2.1.2}SMP Systems}{3}{subsubsection.1.2.1.2} \contentsline {subsection}{\numberline {1.2.2}Message-Passing Systems}{4}{subsection.1.2.2} \contentsline {subsubsection}{\numberline {1.2.2.1}Basic Architecture}{4}{subsubsection.1.2.2.1} \contentsline {subsubsection}{\numberline {1.2.2.2}Example: Networks of Workstations (NOWs)}{4}{subsubsection.1.2.2.2} \contentsline {subsection}{\numberline {1.2.3}SIMD}{5}{subsection.1.2.3} \contentsline {section}{\numberline {1.3}Programmer World Views}{5}{section.1.3} \contentsline {subsection}{\numberline {1.3.1}Example: Matrix-Vector Multiply}{5}{subsection.1.3.1} \contentsline {subsection}{\numberline {1.3.2}Shared-Memory}{6}{subsection.1.3.2} \contentsline {subsubsection}{\numberline {1.3.2.1}Programmer View}{6}{subsubsection.1.3.2.1} \contentsline {subsubsection}{\numberline {1.3.2.2}Example: Pthreads Prime Numbers Finder}{7}{subsubsection.1.3.2.2} \contentsline {subsubsection}{\numberline {1.3.2.3}Role of the OS}{12}{subsubsection.1.3.2.3} \contentsline {subsubsection}{\numberline {1.3.2.4}Debugging Threads Programs}{12}{subsubsection.1.3.2.4} \contentsline {subsubsection}{\numberline {1.3.2.5}Higher-Level Threads}{13}{subsubsection.1.3.2.5} \contentsline {subsubsection}{\numberline {1.3.2.6}Example: Sampling Bucket Sort}{13}{subsubsection.1.3.2.6} \contentsline {subsection}{\numberline {1.3.3}Message Passing}{16}{subsection.1.3.3} \contentsline {subsubsection}{\numberline {1.3.3.1}Programmer View}{16}{subsubsection.1.3.3.1} \contentsline {subsubsection}{\numberline {1.3.3.2}Example: MPI Prime Numbers Finder}{16}{subsubsection.1.3.3.2} \contentsline {subsection}{\numberline {1.3.4}Scatter/Gather}{19}{subsection.1.3.4} \contentsline {chapter}{\numberline {2}Recurring Performance Issues}{21}{chapter.2} \contentsline {section}{\numberline {2.1}Communication Bottlenecks}{21}{section.2.1} \contentsline {section}{\numberline {2.2}Load Balancing}{22}{section.2.2} \contentsline {section}{\numberline {2.3}``Embarrassingly Parallel'' Applications}{22}{section.2.3} \contentsline {subsection}{\numberline {2.3.1}What People Mean by ``Embarrassingly Parallel''}{22}{subsection.2.3.1} \contentsline {subsection}{\numberline {2.3.2}Iterative Algorithms}{23}{subsection.2.3.2} \contentsline {section}{\numberline {2.4}Static (But Possibly Random) Task Assignment Typically Better Than Dynamic}{24}{section.2.4} \contentsline {subsection}{\numberline {2.4.1}Example: Matrix-Vector Multiply}{24}{subsection.2.4.1} \contentsline {subsection}{\numberline {2.4.2}(Outline of) Proof That Static Is Typically Better}{25}{subsection.2.4.2} \contentsline {subsection}{\numberline {2.4.3}Load Balance, Revisited}{26}{subsection.2.4.3} \contentsline {subsection}{\numberline {2.4.4}Example: Mutual Web Outlinks}{27}{subsection.2.4.4} \contentsline {subsection}{\numberline {2.4.5}Work Stealing}{28}{subsection.2.4.5} \contentsline {subsection}{\numberline {2.4.6}Timing Example}{28}{subsection.2.4.6} \contentsline {section}{\numberline {2.5}Latency and Bandwidth}{28}{section.2.5} \contentsline {section}{\numberline {2.6}Relative Merits: Performance of Shared-Memory Vs. Message-Passing}{29}{section.2.6} \contentsline {section}{\numberline {2.7}Memory Allocation Issues}{30}{section.2.7} \contentsline {section}{\numberline {2.8}Issues Particular to Shared-Memory Systems}{30}{section.2.8} \contentsline {chapter}{\numberline {3}Shared Memory Parallelism}{31}{chapter.3} \contentsline {section}{\numberline {3.1}What Is Shared?}{31}{section.3.1} \contentsline {section}{\numberline {3.2}Memory Modules}{32}{section.3.2} \contentsline {subsection}{\numberline {3.2.1}Interleaving}{32}{subsection.3.2.1} \contentsline {subsection}{\numberline {3.2.2}Bank Conflicts and Solutions}{33}{subsection.3.2.2} \contentsline {subsection}{\numberline {3.2.3}Example: Code to Implement Padding}{35}{subsection.3.2.3} \contentsline {section}{\numberline {3.3}Interconnection Topologies}{36}{section.3.3} \contentsline {subsection}{\numberline {3.3.1}SMP Systems}{36}{subsection.3.3.1} \contentsline {subsection}{\numberline {3.3.2}NUMA Systems}{37}{subsection.3.3.2} \contentsline {subsection}{\numberline {3.3.3}NUMA Interconnect Topologies}{38}{subsection.3.3.3} \contentsline {subsubsection}{\numberline {3.3.3.1}Crossbar Interconnects}{38}{subsubsection.3.3.3.1} \contentsline {subsubsection}{\numberline {3.3.3.2}Omega (or Delta) Interconnects}{40}{subsubsection.3.3.3.2} \contentsline {subsection}{\numberline {3.3.4}Comparative Analysis}{41}{subsection.3.3.4} \contentsline {subsection}{\numberline {3.3.5}Why Have Memory in Modules?}{42}{subsection.3.3.5} \contentsline {section}{\numberline {3.4}Synchronization Hardware}{43}{section.3.4} \contentsline {subsection}{\numberline {3.4.1}Test-and-Set Instructions}{43}{subsection.3.4.1} \contentsline {subsubsection}{\numberline {3.4.1.1}LOCK Prefix on Intel Processors}{44}{subsubsection.3.4.1.1} \contentsline {subsubsection}{\numberline {3.4.1.2}Example:}{44}{subsubsection.3.4.1.2} \contentsline {subsubsection}{\numberline {3.4.1.3}Locks with More Complex Interconnects}{44}{subsubsection.3.4.1.3} \contentsline {subsection}{\numberline {3.4.2}May Not Need the Latest}{45}{subsection.3.4.2} \contentsline {subsection}{\numberline {3.4.3}Compare-and-Swap Instructions}{45}{subsection.3.4.3} \contentsline {subsection}{\numberline {3.4.4}Fetch-and-Add Instructions}{45}{subsection.3.4.4} \contentsline {section}{\numberline {3.5}Cache Issues}{46}{section.3.5} \contentsline {subsection}{\numberline {3.5.1}Cache Coherency}{46}{subsection.3.5.1} \contentsline {subsection}{\numberline {3.5.2}Example: the MESI Cache Coherency Protocol}{49}{subsection.3.5.2} \contentsline {subsection}{\numberline {3.5.3}The Problem of ``False Sharing''}{51}{subsection.3.5.3} \contentsline {section}{\numberline {3.6}Memory-Access Consistency Policies}{51}{section.3.6} \contentsline {section}{\numberline {3.7}Fetch-and-Add Combining within Interconnects}{54}{section.3.7} \contentsline {section}{\numberline {3.8}Multicore Chips}{54}{section.3.8} \contentsline {section}{\numberline {3.9}Optimal Number of Threads}{54}{section.3.9} \contentsline {section}{\numberline {3.10}Processor Affinity}{55}{section.3.10} \contentsline {section}{\numberline {3.11}Illusion of Shared-Memory through Software}{55}{section.3.11} \contentsline {subsubsection}{\numberline {3.11.0.1}Software Distributed Shared Memory}{55}{subsubsection.3.11.0.1} \contentsline {subsubsection}{\numberline {3.11.0.2}Case Study: JIAJIA}{58}{subsubsection.3.11.0.2} \contentsline {section}{\numberline {3.12}Barrier Implementation}{61}{section.3.12} \contentsline {subsection}{\numberline {3.12.1}A Use-Once Version}{62}{subsection.3.12.1} \contentsline {subsection}{\numberline {3.12.2}An Attempt to Write a Reusable Version}{62}{subsection.3.12.2} \contentsline {subsection}{\numberline {3.12.3}A Correct Version}{63}{subsection.3.12.3} \contentsline {subsection}{\numberline {3.12.4}Refinements}{63}{subsection.3.12.4} \contentsline {subsubsection}{\numberline {3.12.4.1}Use of Wait Operations}{63}{subsubsection.3.12.4.1} \contentsline {subsubsection}{\numberline {3.12.4.2}Parallelizing the Barrier Operation}{65}{subsubsection.3.12.4.2} \contentsline {paragraph}{\numberline {3.12.4.2.1}Tree Barriers}{65}{paragraph.3.12.4.2.1} \contentsline {paragraph}{\numberline {3.12.4.2.2}Butterfly Barriers}{65}{paragraph.3.12.4.2.2} \contentsline {chapter}{\numberline {4}Introduction to OpenMP}{67}{chapter.4} \contentsline {section}{\numberline {4.1}Overview}{67}{section.4.1} \contentsline {section}{\numberline {4.2}Example: Dijkstra Shortest-Path Algorithm}{67}{section.4.2} \contentsline {subsection}{\numberline {4.2.1}The Algorithm}{70}{subsection.4.2.1} \contentsline {subsection}{\numberline {4.2.2}The OpenMP {\tt parallel} Pragma}{70}{subsection.4.2.2} \contentsline {subsection}{\numberline {4.2.3}Scope Issues}{71}{subsection.4.2.3} \contentsline {subsection}{\numberline {4.2.4}The OpenMP {\tt single} Pragma}{72}{subsection.4.2.4} \contentsline {subsection}{\numberline {4.2.5}The OpenMP {\tt barrier} Pragma}{72}{subsection.4.2.5} \contentsline {subsection}{\numberline {4.2.6}Implicit Barriers}{73}{subsection.4.2.6} \contentsline {subsection}{\numberline {4.2.7}The OpenMP {\tt critical} Pragma}{73}{subsection.4.2.7} \contentsline {section}{\numberline {4.3}The OpenMP {\tt for} Pragma}{73}{section.4.3} \contentsline {subsection}{\numberline {4.3.1}Example: Dijkstra with Parallel for Loops}{73}{subsection.4.3.1} \contentsline {subsection}{\numberline {4.3.2}Nested Loops}{76}{subsection.4.3.2} \contentsline {subsection}{\numberline {4.3.3}Controlling the Partitioning of Work to Threads: the schedule Clause}{76}{subsection.4.3.3} \contentsline {subsection}{\numberline {4.3.4}Example: In-Place Matrix Transpose}{78}{subsection.4.3.4} \contentsline {subsection}{\numberline {4.3.5}The OpenMP {\tt reduction} Clause}{79}{subsection.4.3.5} \contentsline {section}{\numberline {4.4}Example: Mandelbrot Set}{80}{section.4.4} \contentsline {section}{\numberline {4.5}The Task Directive}{83}{section.4.5} \contentsline {subsection}{\numberline {4.5.1}Example: Quicksort}{84}{subsection.4.5.1} \contentsline {section}{\numberline {4.6}Other OpenMP Synchronization Issues}{85}{section.4.6} \contentsline {subsection}{\numberline {4.6.1}The OpenMP {\tt atomic} Clause}{85}{subsection.4.6.1} \contentsline {subsection}{\numberline {4.6.2}Memory Consistency and the {\tt flush} Pragma}{86}{subsection.4.6.2} \contentsline {section}{\numberline {4.7}Combining Work-Sharing Constructs}{87}{section.4.7} \contentsline {section}{\numberline {4.8}The Rest of OpenMP}{87}{section.4.8} \contentsline {section}{\numberline {4.9}Compiling, Running and Debugging OpenMP Code}{87}{section.4.9} \contentsline {subsection}{\numberline {4.9.1}Compiling}{87}{subsection.4.9.1} \contentsline {subsection}{\numberline {4.9.2}Running}{88}{subsection.4.9.2} \contentsline {subsection}{\numberline {4.9.3}Debugging}{88}{subsection.4.9.3} \contentsline {section}{\numberline {4.10}Performance}{89}{section.4.10} \contentsline {subsection}{\numberline {4.10.1}The Effect of Problem Size}{89}{subsection.4.10.1} \contentsline {subsection}{\numberline {4.10.2}Some Fine Tuning}{89}{subsection.4.10.2} \contentsline {subsection}{\numberline {4.10.3}OpenMP Internals}{93}{subsection.4.10.3} \contentsline {section}{\numberline {4.11}Example: Root Finding}{94}{section.4.11} \contentsline {section}{\numberline {4.12}Example: Mutual Outlinks}{95}{section.4.12} \contentsline {section}{\numberline {4.13}Example: Transforming an Adjacency Matrix}{97}{section.4.13} \contentsline {section}{\numberline {4.14}Locks with OpenMP}{100}{section.4.14} \contentsline {section}{\numberline {4.15}Other Examples of OpenMP Code in This Book}{100}{section.4.15} \contentsline {chapter}{\numberline {5}Introduction to GPU Programming with CUDA}{101}{chapter.5} \contentsline {section}{\numberline {5.1}Overview}{101}{section.5.1} \contentsline {section}{\numberline {5.2}Example: Calculate Row Sums}{102}{section.5.2} \contentsline {section}{\numberline {5.3}Understanding the Hardware Structure}{106}{section.5.3} \contentsline {subsection}{\numberline {5.3.1}Processing Units}{106}{subsection.5.3.1} \contentsline {subsection}{\numberline {5.3.2}Thread Operation}{106}{subsection.5.3.2} \contentsline {subsubsection}{\numberline {5.3.2.1}SIMT Architecture}{106}{subsubsection.5.3.2.1} \contentsline {subsubsection}{\numberline {5.3.2.2}The Problem of Thread Divergence}{107}{subsubsection.5.3.2.2} \contentsline {subsubsection}{\numberline {5.3.2.3}``OS in Hardware''}{107}{subsubsection.5.3.2.3} \contentsline {subsection}{\numberline {5.3.3}Memory Structure}{108}{subsection.5.3.3} \contentsline {subsubsection}{\numberline {5.3.3.1}Shared and Global Memory}{108}{subsubsection.5.3.3.1} \contentsline {subsubsection}{\numberline {5.3.3.2}Global-Memory Performance Issues}{111}{subsubsection.5.3.3.2} \contentsline {subsubsection}{\numberline {5.3.3.3}Shared-Memory Performance Issues}{112}{subsubsection.5.3.3.3} \contentsline {subsubsection}{\numberline {5.3.3.4}Host/Device Memory Transfer Performance Issues}{112}{subsubsection.5.3.3.4} \contentsline {subsubsection}{\numberline {5.3.3.5}Other Types of Memory}{113}{subsubsection.5.3.3.5} \contentsline {subsection}{\numberline {5.3.4}Threads Hierarchy}{114}{subsection.5.3.4} \contentsline {subsection}{\numberline {5.3.5}What's NOT There}{116}{subsection.5.3.5} \contentsline {section}{\numberline {5.4}Synchronization, Within and Between Blocks}{117}{section.5.4} \contentsline {section}{\numberline {5.5}More on the Blocks/Threads Tradeoff}{118}{section.5.5} \contentsline {section}{\numberline {5.6}Hardware Requirements, Installation, Compilation, Debugging}{118}{section.5.6} \contentsline {section}{\numberline {5.7}Example: Improving the Row Sums Program}{120}{section.5.7} \contentsline {section}{\numberline {5.8}Example: Finding the Mean Number of Mutual Outlinks}{122}{section.5.8} \contentsline {section}{\numberline {5.9}Example: Finding Prime Numbers}{123}{section.5.9} \contentsline {section}{\numberline {5.10}Example: Finding Cumulative Sums}{126}{section.5.10} \contentsline {section}{\numberline {5.11}Example: Transforming an Adjacency Matrix}{127}{section.5.11} \contentsline {section}{\numberline {5.12}Error Checking}{130}{section.5.12} \contentsline {section}{\numberline {5.13}Loop Unrolling}{131}{section.5.13} \contentsline {section}{\numberline {5.14}Short Vectors}{131}{section.5.14} \contentsline {section}{\numberline {5.15}The New Generation}{132}{section.5.15} \contentsline {section}{\numberline {5.16}CUDA from a Higher Level}{132}{section.5.16} \contentsline {subsection}{\numberline {5.16.1}CUBLAS}{132}{subsection.5.16.1} \contentsline {subsubsection}{\numberline {5.16.1.1}Example: Row Sums Once Again}{133}{subsubsection.5.16.1.1} \contentsline {subsection}{\numberline {5.16.2}Thrust}{135}{subsection.5.16.2} \contentsline {subsection}{\numberline {5.16.3}CUDPP}{135}{subsection.5.16.3} \contentsline {subsection}{\numberline {5.16.4}CUFFT}{135}{subsection.5.16.4} \contentsline {section}{\numberline {5.17}Other CUDA Examples in This Book}{135}{section.5.17} \contentsline {chapter}{\numberline {6}Introduction to Thrust Programming}{137}{chapter.6} \contentsline {section}{\numberline {6.1}Compiling Thrust Code}{137}{section.6.1} \contentsline {subsection}{\numberline {6.1.1}Compiling to CUDA}{137}{subsection.6.1.1} \contentsline {subsection}{\numberline {6.1.2}Compiling to OpenMP}{137}{subsection.6.1.2} \contentsline {section}{\numberline {6.2}Example: Counting the Number of Unique Values in an Array}{138}{section.6.2} \contentsline {section}{\numberline {6.3}Example: A Plain-C Wrapper for Thrust sort()}{142}{section.6.3} \contentsline {section}{\numberline {6.4}Example: Calculating Percentiles in an Array}{143}{section.6.4} \contentsline {section}{\numberline {6.5}Mixing Thrust and CUDA Code}{144}{section.6.5} \contentsline {section}{\numberline {6.6}Example: Doubling Every k$^{th}$ Element of an Array}{144}{section.6.6} \contentsline {section}{\numberline {6.7}Scatter and Gather Operations}{146}{section.6.7} \contentsline {subsection}{\numberline {6.7.1}Example: Matrix Transpose}{147}{subsection.6.7.1} \contentsline {section}{\numberline {6.8}Prefix Scan}{148}{section.6.8} \contentsline {section}{\numberline {6.9}Advanced (``Fancy'') Iterators}{149}{section.6.9} \contentsline {subsection}{\numberline {6.9.1}Example: Matrix Transpose Again}{149}{subsection.6.9.1} \contentsline {section}{\numberline {6.10}A Timing Comparison}{151}{section.6.10} \contentsline {section}{\numberline {6.11}Example: Transforming an Adjacency Matrix}{155}{section.6.11} \contentsline {section}{\numberline {6.12}More on Use of Thrust for a CUDA Back End}{157}{section.6.12} \contentsline {subsection}{\numberline {6.12.1}Synchronicity}{157}{subsection.6.12.1} \contentsline {section}{\numberline {6.13}Error Messages}{157}{section.6.13} \contentsline {section}{\numberline {6.14}Other Examples of Thrust Code in This Book}{158}{section.6.14} \contentsline {chapter}{\numberline {7}Message Passing Systems}{159}{chapter.7} \contentsline {section}{\numberline {7.1}Overview}{159}{section.7.1} \contentsline {section}{\numberline {7.2}A Historical Example: Hypercubes}{160}{section.7.2} \contentsline {subsection}{\numberline {7.2.1}Definitions}{160}{subsection.7.2.1} \contentsline {section}{\numberline {7.3}Networks of Workstations (NOWs)}{162}{section.7.3} \contentsline {subsection}{\numberline {7.3.1}The Network Is Literally the Weakest Link}{162}{subsection.7.3.1} \contentsline {subsection}{\numberline {7.3.2}Other Issues}{163}{subsection.7.3.2} \contentsline {section}{\numberline {7.4}Scatter/Gather Operations}{163}{section.7.4} \contentsline {chapter}{\numberline {8}Introduction to MPI}{165}{chapter.8} \contentsline {section}{\numberline {8.1}Overview}{165}{section.8.1} \contentsline {subsection}{\numberline {8.1.1}History}{165}{subsection.8.1.1} \contentsline {subsection}{\numberline {8.1.2}Structure and Execution}{166}{subsection.8.1.2} \contentsline {subsection}{\numberline {8.1.3}Implementations}{166}{subsection.8.1.3} \contentsline {subsection}{\numberline {8.1.4}Performance Issues}{166}{subsection.8.1.4} \contentsline {section}{\numberline {8.2}Review of Earlier Example}{167}{section.8.2} \contentsline {section}{\numberline {8.3}Example: Dijkstra Algorithm}{167}{section.8.3} \contentsline {subsection}{\numberline {8.3.1}The Algorithm}{167}{subsection.8.3.1} \contentsline {subsection}{\numberline {8.3.2}The MPI Code}{168}{subsection.8.3.2} \contentsline {subsection}{\numberline {8.3.3}Introduction to MPI APIs}{171}{subsection.8.3.3} \contentsline {subsubsection}{\numberline {8.3.3.1}MPI\_Init() and MPI\_Finalize()}{171}{subsubsection.8.3.3.1} \contentsline {subsubsection}{\numberline {8.3.3.2}MPI\_Comm\_size() and MPI\_Comm\_rank()}{171}{subsubsection.8.3.3.2} \contentsline {subsubsection}{\numberline {8.3.3.3}MPI\_Send()}{172}{subsubsection.8.3.3.3} \contentsline {subsubsection}{\numberline {8.3.3.4}MPI\_Recv()}{173}{subsubsection.8.3.3.4} \contentsline {section}{\numberline {8.4}Example: Removing 0s from an Array}{174}{section.8.4} \contentsline {section}{\numberline {8.5}Debugging MPI Code}{175}{section.8.5} \contentsline {section}{\numberline {8.6}Collective Communications}{176}{section.8.6} \contentsline {subsection}{\numberline {8.6.1}Example: Refined Dijkstra Code}{176}{subsection.8.6.1} \contentsline {subsection}{\numberline {8.6.2}MPI\_Bcast()}{179}{subsection.8.6.2} \contentsline {subsection}{\numberline {8.6.3}MPI\_Reduce()/MPI\_Allreduce()}{180}{subsection.8.6.3} \contentsline {subsection}{\numberline {8.6.4}MPI\_Gather()/MPI\_Allgather()}{181}{subsection.8.6.4} \contentsline {subsection}{\numberline {8.6.5}The MPI\_Scatter()}{181}{subsection.8.6.5} \contentsline {subsection}{\numberline {8.6.6}Example: Count the Number of Edges in a Directed Graph}{182}{subsection.8.6.6} \contentsline {subsection}{\numberline {8.6.7}Example: Cumulative Sums}{182}{subsection.8.6.7} \contentsline {subsection}{\numberline {8.6.8}Example: an MPI Solution to the Mutual Outlinks Problem}{184}{subsection.8.6.8} \contentsline {subsection}{\numberline {8.6.9}The MPI\_Barrier()}{185}{subsection.8.6.9} \contentsline {subsection}{\numberline {8.6.10}Creating Communicators}{186}{subsection.8.6.10} \contentsline {section}{\numberline {8.7}Buffering, Synchrony and Related Issues}{186}{section.8.7} \contentsline {subsection}{\numberline {8.7.1}Buffering, Etc.}{187}{subsection.8.7.1} \contentsline {subsection}{\numberline {8.7.2}Safety}{188}{subsection.8.7.2} \contentsline {subsection}{\numberline {8.7.3}Living Dangerously}{189}{subsection.8.7.3} \contentsline {subsection}{\numberline {8.7.4}Safe Exchange Operations}{189}{subsection.8.7.4} \contentsline {section}{\numberline {8.8}Use of MPI from Other Languages}{189}{section.8.8} \contentsline {section}{\numberline {8.9}Other MPI Examples in This Book}{189}{section.8.9} \contentsline {chapter}{\numberline {9}Cloud Computing}{191}{chapter.9} \contentsline {section}{\numberline {9.1}Platforms and Modes}{192}{section.9.1} \contentsline {section}{\numberline {9.2}Overview of Operations}{192}{section.9.2} \contentsline {section}{\numberline {9.3}Role of Keys}{193}{section.9.3} \contentsline {section}{\numberline {9.4}Hadoop Streaming}{193}{section.9.4} \contentsline {section}{\numberline {9.5}Example: Word Count}{193}{section.9.5} \contentsline {section}{\numberline {9.6}Example: Maximum Air Temperature by Year}{194}{section.9.6} \contentsline {section}{\numberline {9.7}Role of Disk Files}{195}{section.9.7} \contentsline {section}{\numberline {9.8}The Hadoop Shell}{196}{section.9.8} \contentsline {section}{\numberline {9.9}Running Hadoop}{196}{section.9.9} \contentsline {section}{\numberline {9.10}Example: Transforming an Adjacency Graph}{197}{section.9.10} \contentsline {section}{\numberline {9.11}Example: Identifying Outliers}{200}{section.9.11} \contentsline {section}{\numberline {9.12}Debugging Hadoop Streaming Programs}{203}{section.9.12} \contentsline {section}{\numberline {9.13}It's a Lot More Than Just Programming}{204}{section.9.13} \contentsline {chapter}{\numberline {10}Introduction to Parallel R}{205}{chapter.10} \contentsline {section}{\numberline {10.1}Why Is R Featured in This Book?}{205}{section.10.1} \contentsline {section}{\numberline {10.2}R and Embarrassing Parallel Problems}{206}{section.10.2} \contentsline {section}{\numberline {10.3}Some Parallel R Packages}{206}{section.10.3} \contentsline {section}{\numberline {10.4}Installing and Loading the Packages}{207}{section.10.4} \contentsline {section}{\numberline {10.5}The R snow Package}{207}{section.10.5} \contentsline {subsection}{\numberline {10.5.1}Usage}{208}{subsection.10.5.1} \contentsline {subsection}{\numberline {10.5.2}Example: Matrix-Vector Multiply, Using parApply()}{208}{subsection.10.5.2} \contentsline {subsection}{\numberline {10.5.3}Other snow Functions: clusterApply(), clusterCall() Etc.}{210}{subsection.10.5.3} \contentsline {subsection}{\numberline {10.5.4}Example: Parallel Sum}{212}{subsection.10.5.4} \contentsline {subsection}{\numberline {10.5.5}Example: Inversion of Block-Diagonal Matrices}{214}{subsection.10.5.5} \contentsline {subsection}{\numberline {10.5.6}Example: Mutual Outlinks}{216}{subsection.10.5.6} \contentsline {subsection}{\numberline {10.5.7}Example: Transforming an Adjacency Matrix}{218}{subsection.10.5.7} \contentsline {subsection}{\numberline {10.5.8}Example: Setting Node IDs and Notification of Cluster Size}{220}{subsection.10.5.8} \contentsline {subsection}{\numberline {10.5.9}Shutting Down a Cluster}{221}{subsection.10.5.9} \contentsline {section}{\numberline {10.6}The multicore Package}{222}{section.10.6} \contentsline {subsection}{\numberline {10.6.1}Example: Transforming an Adjacency Matrix, multicore Version}{223}{subsection.10.6.1} \contentsline {section}{\numberline {10.7}Rdsm}{224}{section.10.7} \contentsline {subsection}{\numberline {10.7.1}Example: Inversion of Block-Diagonal Matrices}{224}{subsection.10.7.1} \contentsline {subsection}{\numberline {10.7.2}Example: Web Probe}{225}{subsection.10.7.2} \contentsline {subsection}{\numberline {10.7.3}The bigmemory Package}{227}{subsection.10.7.3} \contentsline {section}{\numberline {10.8}R with GPUs}{227}{section.10.8} \contentsline {subsection}{\numberline {10.8.1}Installation}{227}{subsection.10.8.1} \contentsline {subsection}{\numberline {10.8.2}The gputools Package}{228}{subsection.10.8.2} \contentsline {subsection}{\numberline {10.8.3}The rgpu Package}{228}{subsection.10.8.3} \contentsline {section}{\numberline {10.9}Parallelism Via Calling C from R}{229}{section.10.9} \contentsline {subsection}{\numberline {10.9.1}Calling C from R}{229}{subsection.10.9.1} \contentsline {subsection}{\numberline {10.9.2}Example: Extracting Subdiagonals of a Matrix}{230}{subsection.10.9.2} \contentsline {subsection}{\numberline {10.9.3}Calling C OpenMP Code from R}{231}{subsection.10.9.3} \contentsline {subsection}{\numberline {10.9.4}Calling CUDA Code from R}{231}{subsection.10.9.4} \contentsline {subsection}{\numberline {10.9.5}Example: Mutual Outlinks}{232}{subsection.10.9.5} \contentsline {section}{\numberline {10.10}Debugging R Applications}{233}{section.10.10} \contentsline {subsection}{\numberline {10.10.1}Text Editors}{234}{subsection.10.10.1} \contentsline {subsection}{\numberline {10.10.2}IDEs}{234}{subsection.10.10.2} \contentsline {subsection}{\numberline {10.10.3}The Problem of Lack of a Terminal}{234}{subsection.10.10.3} \contentsline {subsection}{\numberline {10.10.4}Debugging C Called from R}{235}{subsection.10.10.4} \contentsline {section}{\numberline {10.11}Other R Examples in This Book}{235}{section.10.11} \contentsline {chapter}{\numberline {11}The Parallel Prefix Problem}{237}{chapter.11} \contentsline {section}{\numberline {11.1}Example: Permutations}{237}{section.11.1} \contentsline {section}{\numberline {11.2}General Strategies for Parallel Scan Computation}{238}{section.11.2} \contentsline {section}{\numberline {11.3}Implementations}{241}{section.11.3} \contentsline {section}{\numberline {11.4}Example: Parallel Prefix, Run-Length Decoding in OpenMP}{241}{section.11.4} \contentsline {section}{\numberline {11.5}Example: Run-Length Decompression in Thrust}{243}{section.11.5} \contentsline {chapter}{\numberline {12}Introduction to Parallel Matrix Operations}{245}{chapter.12} \contentsline {section}{\numberline {12.1}``We're Not in Physicsland Anymore, Toto''}{245}{section.12.1} \contentsline {section}{\numberline {12.2}Partitioned Matrices}{245}{section.12.2} \contentsline {section}{\numberline {12.3}Parallel Matrix Multiplication}{247}{section.12.3} \contentsline {subsection}{\numberline {12.3.1}Message-Passing Case}{247}{subsection.12.3.1} \contentsline {subsubsection}{\numberline {12.3.1.1}Fox's Algorithm}{248}{subsubsection.12.3.1.1} \contentsline {subsubsection}{\numberline {12.3.1.2}Performance Issues}{249}{subsubsection.12.3.1.2} \contentsline {subsection}{\numberline {12.3.2}Shared-Memory Case}{249}{subsection.12.3.2} \contentsline {subsubsection}{\numberline {12.3.2.1}Example: Matrix Multiply in OpenMP}{249}{subsubsection.12.3.2.1} \contentsline {subsubsection}{\numberline {12.3.2.2}Example: Matrix Multiply in CUDA}{250}{subsubsection.12.3.2.2} \contentsline {section}{\numberline {12.4}Finding Powers of Matrices}{253}{section.12.4} \contentsline {subsection}{\numberline {12.4.1}Example: Graph Connectedness}{253}{subsection.12.4.1} \contentsline {subsection}{\numberline {12.4.2}Example: Fibonacci Numbers}{254}{subsection.12.4.2} \contentsline {subsection}{\numberline {12.4.3}Example: Matrix Inversion}{254}{subsection.12.4.3} \contentsline {subsection}{\numberline {12.4.4}Parallel Computation}{255}{subsection.12.4.4} \contentsline {section}{\numberline {12.5}Solving Systems of Linear Equations}{255}{section.12.5} \contentsline {subsection}{\numberline {12.5.1}Gaussian Elimination}{256}{subsection.12.5.1} \contentsline {subsection}{\numberline {12.5.2}Example: Gaussian Elimination in CUDA}{257}{subsection.12.5.2} \contentsline {subsection}{\numberline {12.5.3}The Jacobi Algorithm}{258}{subsection.12.5.3} \contentsline {subsection}{\numberline {12.5.4}Example: OpenMP Implementation of the Jacobi Algorithm}{259}{subsection.12.5.4} \contentsline {subsection}{\numberline {12.5.5}Example: R/gputools Implementation of Jacobi}{260}{subsection.12.5.5} \contentsline {section}{\numberline {12.6}Eigenvalues and Eigenvectors}{260}{section.12.6} \contentsline {subsection}{\numberline {12.6.1}The Power Method}{260}{subsection.12.6.1} \contentsline {subsection}{\numberline {12.6.2}Parallel Computation}{261}{subsection.12.6.2} \contentsline {section}{\numberline {12.7}Sparse Matrices}{262}{section.12.7} \contentsline {section}{\numberline {12.8}Libraries}{263}{section.12.8} \contentsline {chapter}{\numberline {13}Introduction to Parallel Sorting}{265}{chapter.13} \contentsline {section}{\numberline {13.1}Quicksort}{265}{section.13.1} \contentsline {subsection}{\numberline {13.1.1}The Separation Process}{265}{subsection.13.1.1} \contentsline {subsection}{\numberline {13.1.2}Example: OpenMP Quicksort}{267}{subsection.13.1.2} \contentsline {subsection}{\numberline {13.1.3}Hyperquicksort}{268}{subsection.13.1.3} \contentsline {section}{\numberline {13.2}Mergesorts}{269}{section.13.2} \contentsline {subsection}{\numberline {13.2.1}Sequential Form}{269}{subsection.13.2.1} \contentsline {subsection}{\numberline {13.2.2}Shared-Memory Mergesort}{269}{subsection.13.2.2} \contentsline {subsection}{\numberline {13.2.3}Message Passing Mergesort on a Tree Topology}{269}{subsection.13.2.3} \contentsline {subsection}{\numberline {13.2.4}Compare-Exchange Operations}{270}{subsection.13.2.4} \contentsline {subsection}{\numberline {13.2.5}Bitonic Mergesort}{270}{subsection.13.2.5} \contentsline {section}{\numberline {13.3}The Bubble Sort and Its Cousins}{272}{section.13.3} \contentsline {subsection}{\numberline {13.3.1}The Much-Maligned Bubble Sort}{272}{subsection.13.3.1} \contentsline {subsection}{\numberline {13.3.2}A Popular Variant: Odd-Even Transposition}{273}{subsection.13.3.2} \contentsline {subsection}{\numberline {13.3.3}Example: CUDA Implementation of Odd/Even Transposition Sort}{273}{subsection.13.3.3} \contentsline {section}{\numberline {13.4}Shearsort}{275}{section.13.4} \contentsline {section}{\numberline {13.5}Bucket Sort with Sampling}{275}{section.13.5} \contentsline {section}{\numberline {13.6}Radix Sort}{279}{section.13.6} \contentsline {section}{\numberline {13.7}Enumeration Sort}{279}{section.13.7} \contentsline {chapter}{\numberline {14}Parallel Computation for Audio and Image Processing}{281}{chapter.14} \contentsline {section}{\numberline {14.1}General Principles}{281}{section.14.1} \contentsline {subsection}{\numberline {14.1.1}One-Dimensional Fourier Series}{281}{subsection.14.1.1} \contentsline {subsection}{\numberline {14.1.2}Two-Dimensional Fourier Series}{285}{subsection.14.1.2} \contentsline {section}{\numberline {14.2}Discrete Fourier Transforms}{285}{section.14.2} \contentsline {subsection}{\numberline {14.2.1}One-Dimensional Data}{286}{subsection.14.2.1} \contentsline {subsection}{\numberline {14.2.2}Inversion}{287}{subsection.14.2.2} \contentsline {subsubsection}{\numberline {14.2.2.1}Alternate Formulation}{288}{subsubsection.14.2.2.1} \contentsline {subsection}{\numberline {14.2.3}Two-Dimensional Data}{288}{subsection.14.2.3} \contentsline {section}{\numberline {14.3}Parallel Computation of Discrete Fourier Transforms}{289}{section.14.3} \contentsline {subsection}{\numberline {14.3.1}The Fast Fourier Transform}{289}{subsection.14.3.1} \contentsline {subsection}{\numberline {14.3.2}A Matrix Approach}{290}{subsection.14.3.2} \contentsline {subsection}{\numberline {14.3.3}Parallelizing Computation of the Inverse Transform}{290}{subsection.14.3.3} \contentsline {subsection}{\numberline {14.3.4}Parallelizing Computation of the Two-Dimensional Transform}{290}{subsection.14.3.4} \contentsline {section}{\numberline {14.4}Available FFT Software}{291}{section.14.4} \contentsline {subsection}{\numberline {14.4.1}R}{291}{subsection.14.4.1} \contentsline {subsection}{\numberline {14.4.2}CUFFT}{291}{subsection.14.4.2} \contentsline {subsection}{\numberline {14.4.3}FFTW}{291}{subsection.14.4.3} \contentsline {section}{\numberline {14.5}Applications to Image Processing}{292}{section.14.5} \contentsline {subsection}{\numberline {14.5.1}Smoothing}{292}{subsection.14.5.1} \contentsline {subsection}{\numberline {14.5.2}Example: Audio Smoothing in R}{292}{subsection.14.5.2} \contentsline {subsection}{\numberline {14.5.3}Edge Detection}{293}{subsection.14.5.3} \contentsline {section}{\numberline {14.6}R Access to Sound and Image Files}{294}{section.14.6} \contentsline {section}{\numberline {14.7}Keeping the Pixel Intensities in the Proper Range}{294}{section.14.7} \contentsline {section}{\numberline {14.8}Does the Function g() Really Have to Be Repeating?}{295}{section.14.8} \contentsline {section}{\numberline {14.9}Vector Space Issues (optional section)}{295}{section.14.9} \contentsline {section}{\numberline {14.10}Bandwidth: How to Read the \textit {San Francisco Chronicle} Business Page (optional section)}{297}{section.14.10} \contentsline {chapter}{\numberline {15}Parallel Computation in Statistics/Data Mining}{299}{chapter.15} \contentsline {section}{\numberline {15.1}Itemset Analysis}{299}{section.15.1} \contentsline {subsection}{\numberline {15.1.1}What Is It?}{299}{subsection.15.1.1} \contentsline {subsection}{\numberline {15.1.2}The Market Basket Problem}{300}{subsection.15.1.2} \contentsline {subsection}{\numberline {15.1.3}Serial Algorithms}{301}{subsection.15.1.3} \contentsline {subsection}{\numberline {15.1.4}Parallelizing the Apriori Algorithm}{302}{subsection.15.1.4} \contentsline {section}{\numberline {15.2}Probability Density Estimation}{302}{section.15.2} \contentsline {subsection}{\numberline {15.2.1}Kernel-Based Density Estimation}{303}{subsection.15.2.1} \contentsline {subsection}{\numberline {15.2.2}Histogram Computation for Images}{306}{subsection.15.2.2} \contentsline {section}{\numberline {15.3}Clustering}{307}{section.15.3} \contentsline {subsection}{\numberline {15.3.1}Example: k-Means Clustering in R}{309}{subsection.15.3.1} \contentsline {section}{\numberline {15.4}Principal Component Analysis (PCA)}{310}{section.15.4} \contentsline {section}{\numberline {15.5}Monte Carlo Simulation}{311}{section.15.5} \contentsline {chapter}{\numberline {16}Parallel Python Threads and Multiprocessing Modules}{313}{chapter.16} \contentsline {section}{\numberline {16.1}The Python Threads and Multiprocessing Modules}{313}{section.16.1} \contentsline {subsection}{\numberline {16.1.1}Python Threads Modules}{313}{subsection.16.1.1} \contentsline {subsubsection}{\numberline {16.1.1.1}The {\tt thread} Module}{314}{subsubsection.16.1.1.1} \contentsline {subsubsection}{\numberline {16.1.1.2}The {\tt threading} Module}{323}{subsubsection.16.1.1.2} \contentsline {subsection}{\numberline {16.1.2}Condition Variables}{327}{subsection.16.1.2} \contentsline {subsubsection}{\numberline {16.1.2.1}General Ideas}{327}{subsubsection.16.1.2.1} \contentsline {subsubsection}{\numberline {16.1.2.2}Other {\tt threading} Classes}{328}{subsubsection.16.1.2.2} \contentsline {subsection}{\numberline {16.1.3}Threads Internals}{328}{subsection.16.1.3} \contentsline {subsubsection}{\numberline {16.1.3.1}Kernel-Level Thread Managers}{328}{subsubsection.16.1.3.1} \contentsline {subsubsection}{\numberline {16.1.3.2}User-Level Thread Managers}{329}{subsubsection.16.1.3.2} \contentsline {subsubsection}{\numberline {16.1.3.3}Comparison}{329}{subsubsection.16.1.3.3} \contentsline {subsubsection}{\numberline {16.1.3.4}The Python Thread Manager}{329}{subsubsection.16.1.3.4} \contentsline {subsubsection}{\numberline {16.1.3.5}The GIL}{330}{subsubsection.16.1.3.5} \contentsline {subsubsection}{\numberline {16.1.3.6}Implications for Randomness and Need for Locks}{331}{subsubsection.16.1.3.6} \contentsline {subsection}{\numberline {16.1.4}The {\tt multiprocessing} Module}{331}{subsection.16.1.4} \contentsline {subsection}{\numberline {16.1.5}The {\tt Queue} Module for Threads and Multiprocessing}{334}{subsection.16.1.5} \contentsline {subsection}{\numberline {16.1.6}Debugging Threaded and Multiprocessing Python Programs}{337}{subsection.16.1.6} \contentsline {section}{\numberline {16.2}Using Python with MPI}{338}{section.16.2} \contentsline {subsection}{\numberline {16.2.1}Using PDB to Debug Threaded Programs}{339}{subsection.16.2.1} \contentsline {subsection}{\numberline {16.2.2}RPDB2 and Winpdb}{341}{subsection.16.2.2} \contentsline {chapter}{\numberline {A}Miscellaneous Systems Issues}{343}{appendix.A} \contentsline {section}{\numberline {A.1}Timesharing}{343}{section.A.1} \contentsline {subsection}{\numberline {A.1.1}Many Processes, Taking Turns}{343}{subsection.A.1.1} \contentsline {section}{\numberline {A.2}Memory Hierarchies}{345}{section.A.2} \contentsline {subsection}{\numberline {A.2.1}Cache Memory}{345}{subsection.A.2.1} \contentsline {subsection}{\numberline {A.2.2}Virtual Memory}{345}{subsection.A.2.2} \contentsline {subsubsection}{\numberline {A.2.2.1}Make Sure You Understand the Goals}{345}{subsubsection.A.2.2.1} \contentsline {subsubsection}{\numberline {A.2.2.2}How It Works}{346}{subsubsection.A.2.2.2} \contentsline {subsection}{\numberline {A.2.3}Performance Issues}{347}{subsection.A.2.3} \contentsline {section}{\numberline {A.3}Array Issues}{347}{section.A.3} \contentsline {subsection}{\numberline {A.3.1}Storage}{347}{subsection.A.3.1} \contentsline {subsection}{\numberline {A.3.2}Subarrays}{348}{subsection.A.3.2} \contentsline {subsection}{\numberline {A.3.3}Memory Allocation}{348}{subsection.A.3.3} \contentsline {chapter}{\numberline {B}Review of Matrix Algebra}{351}{appendix.B} \contentsline {section}{\numberline {B.1}Terminology and Notation}{351}{section.B.1} \contentsline {subsection}{\numberline {B.1.1}Matrix Addition and Multiplication}{352}{subsection.B.1.1} \contentsline {section}{\numberline {B.2}Matrix Transpose}{353}{section.B.2} \contentsline {section}{\numberline {B.3}Linear Independence}{353}{section.B.3} \contentsline {section}{\numberline {B.4}Determinants}{354}{section.B.4} \contentsline {section}{\numberline {B.5}Matrix Inverse}{354}{section.B.5} \contentsline {section}{\numberline {B.6}Eigenvalues and Eigenvectors}{354}{section.B.6} \contentsline {chapter}{\numberline {C}R Quick Start}{357}{appendix.C} \contentsline {section}{\numberline {C.1}Correspondences}{357}{section.C.1} \contentsline {section}{\numberline {C.2}Starting R}{358}{section.C.2} \contentsline {section}{\numberline {C.3}First Sample Programming Session}{358}{section.C.3} \contentsline {section}{\numberline {C.4}Second Sample Programming Session}{361}{section.C.4} \contentsline {section}{\numberline {C.5}Third Sample Programming Session}{363}{section.C.5} \contentsline {section}{\numberline {C.6}Complex Numbers}{363}{section.C.6} \contentsline {section}{\numberline {C.7}Other Sources for Learning R}{364}{section.C.7} \contentsline {section}{\numberline {C.8}Online Help}{364}{section.C.8} \contentsline {section}{\numberline {C.9}Debugging in R}{365}{section.C.9} \contentsline {chapter}{\numberline {D}Introduction to Python}{367}{appendix.D} \contentsline {section}{\numberline {D.1}A 5-Minute Introductory Example}{367}{section.D.1} \contentsline {subsection}{\numberline {D.1.1}Example Program Code}{367}{subsection.D.1.1} \contentsline {subsection}{\numberline {D.1.2}Python Lists}{368}{subsection.D.1.2} \contentsline {subsection}{\numberline {D.1.3}Loops}{368}{subsection.D.1.3} \contentsline {subsection}{\numberline {D.1.4}Python Block Definition}{369}{subsection.D.1.4} \contentsline {subsection}{\numberline {D.1.5}Python Also Offers an Interactive Mode}{371}{subsection.D.1.5} \contentsline {subsection}{\numberline {D.1.6}Python As a Calculator}{372}{subsection.D.1.6} \contentsline {section}{\numberline {D.2}A 10-Minute Introductory Example}{373}{section.D.2} \contentsline {subsection}{\numberline {D.2.1}Example Program Code}{373}{subsection.D.2.1} \contentsline {subsection}{\numberline {D.2.2}Command-Line Arguments}{374}{subsection.D.2.2} \contentsline {subsection}{\numberline {D.2.3}Introduction to File Manipulation}{375}{subsection.D.2.3} \contentsline {subsection}{\numberline {D.2.4}Lack of Declaration}{375}{subsection.D.2.4} \contentsline {subsection}{\numberline {D.2.5}Locals Vs. Globals}{376}{subsection.D.2.5} \contentsline {subsection}{\numberline {D.2.6}A Couple of Built-In Functions}{376}{subsection.D.2.6} \contentsline {section}{\numberline {D.3}Types of Variables/Values}{376}{section.D.3} \contentsline {section}{\numberline {D.4}String Versus Numerical Values}{377}{section.D.4} \contentsline {section}{\numberline {D.5}Sequences}{377}{section.D.5} \contentsline {subsection}{\numberline {D.5.1}Lists (Quasi-Arrays)}{378}{subsection.D.5.1} \contentsline {subsection}{\numberline {D.5.2}Tuples}{380}{subsection.D.5.2} \contentsline {subsection}{\numberline {D.5.3}Strings}{380}{subsection.D.5.3} \contentsline {subsubsection}{\numberline {D.5.3.1}Strings As Turbocharged Tuples}{381}{subsubsection.D.5.3.1} \contentsline {subsubsection}{\numberline {D.5.3.2}Formatted String Manipulation}{382}{subsubsection.D.5.3.2} \contentsline {section}{\numberline {D.6}Dictionaries (Hashes)}{383}{section.D.6} \contentsline {section}{\numberline {D.7}Extended Example: Computing Final Grades}{385}{section.D.7}