#4 add lectures for optimisation 2, plus openmp exercise
martinjrobins committed Oct 8, 2019
1 parent 2245a35 commit 7bf1987
Showing 7 changed files with 1,380 additions and 25 deletions.
24 changes: 24 additions & 0 deletions 13_optimization_2/Makefile
@@ -0,0 +1,24 @@
all: lectures_01_pybind11 practical

lectures_dir = lectures
practicals_dir = practicals
projects_dir = projects
publish_dir = publish

reveal_command = "pandoc -f markdown$(extensions) -t revealjs -s ${reveal_options} $^ -o ${publish_dir}/$@.html"
syntax = --highlight-style pygments_custom.theme

intro: ${lectures_dir}/intro.md
	pandoc -f markdown$(extensions) -t beamer ${syntax} $^ -V theme:metropolis -V aspectratio:169 -o ${publish_dir}/$@.pdf

lectures%: ${lectures_dir}/lectures%.md
	pandoc -f markdown$(extensions) -t beamer ${syntax} $^ -V theme:metropolis -V aspectratio:169 -o ${publish_dir}/$@.pdf

project%: ${projects_dir}/project%.md
	pandoc -f markdown$(extensions) $(pdf_template) $^ -o ${publish_dir}/$@.pdf

practical: ${practicals_dir}/practical.md
	pandoc -f markdown$(extensions) $(pdf_template) $^ -o ${publish_dir}/$@.pdf

clean:
	rm ${publish_dir}/*.pdf
468 changes: 468 additions & 0 deletions 13_optimization_2/lectures/figs/Fork_join.svg
466 changes: 466 additions & 0 deletions 13_optimization_2/lectures/figs/OpenMP_language_extensions.svg
40 changes: 15 additions & 25 deletions 13_optimization_2/lectures/lectures_01_pybind11.md
@@ -1,6 +1,9 @@
-% Optimisation 3 - Wrapping C++ with pybind11
-% Martin Robinson
-% Oct 2019
+---
+title: Optimisation 3 - Wrapping C++ with pybind11
+author: Martin Robinson
+date: Oct 2019
+urlcolor: blue
+---

# Why wrap C++

@@ -56,8 +59,8 @@ PYBIND11_MODULE(example, m) {
use the CMake build system
```bash
-$ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o
-example`python3-config --extension-suffix`
+$ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes`
+example.cpp -o example`python3-config --extension-suffix`
```
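For reference, a minimal `example.cpp` that such a command could compile might look like this; the `add` function is a hypothetical illustration, not the module from the lecture:

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// hypothetical function to expose to Python
int add(int i, int j) { return i + j; }

PYBIND11_MODULE(example, m) {
    m.doc() = "minimal pybind11 example module";
    m.def("add", &add, "Add two integers", py::arg("i"), py::arg("j"));
}
```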

# Using from Python
@@ -175,9 +178,11 @@ PYBIND11_MODULE(example, m) {
```cpp
void print_dict(py::dict dict) {
/* Easily interact with Python types */
-  for (auto item : dict)
+  for (auto item : dict) {
     std::cout << "key=" << std::string(py::str(item.first)) << ", "
-              << "value=" << std::string(py::str(item.second)) << std::endl;
+              << "value=" << std::string(py::str(item.second))
+              << std::endl;
+  }
}
```
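As a side note, a one-line sketch of how this function might then be exposed; the module object `m` and the docstring are assumptions, following the `PYBIND11_MODULE` pattern above:

```cpp
// hypothetical binding, placed inside the PYBIND11_MODULE(example, m) block
m.def("print_dict", &print_dict, "Print the keys and values of a Python dict");
```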

@@ -202,8 +207,9 @@ double norm(py::array_t<double> input, const int p) {
```cpp
double norm(py::array_t<double> input, const int p) {
-  auto buf = input.unchecked<1>(); // input must have ndim = 1; can be non-writeable
-  // use input.mutable_unchecked for writeable access
+  // input must have ndim = 1; can be non-writeable
+  // use input.mutable_unchecked for writeable access
+  auto buf = input.unchecked<1>();
double result = 0.0;
  for (py::ssize_t i = 0; i < buf.shape(0); ++i) {
    result += std::pow(buf[i], p);
@@ -242,19 +248,3 @@ PYBIND11_MAKE_OPAQUE(std::vector<int>);
- This has been a summary of the pybind11 features you will need for the exercises
- See the [documentation](https://pybind11.readthedocs.io/en/master/index.html) for many
  more details, explanation, and additional features
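
Since the diff truncates the `norm` listing above, here is a self-contained sketch of how the complete function might read; the final `return` line and the use of `std::abs` are assumptions based on the usual p-norm definition:

```cpp
#include <cmath>
#include <pybind11/numpy.h>

namespace py = pybind11;

// p-norm of a 1-D NumPy array: (sum_i |a_i|^p)^(1/p)
double norm(py::array_t<double> input, const int p) {
    // input must have ndim = 1; can be non-writeable
    // use input.mutable_unchecked for writeable access
    auto buf = input.unchecked<1>();
    double result = 0.0;
    for (py::ssize_t i = 0; i < buf.shape(0); ++i) {
        result += std::pow(std::abs(buf[i]), p);
    }
    return std::pow(result, 1.0 / p);
}
```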
178 changes: 178 additions & 0 deletions 13_optimization_2/lectures/lectures_02_parallel.md
@@ -0,0 +1,178 @@
---
title: Optimisation 4 - Parallel execution
author: Martin Robinson
date: Oct 2019
urlcolor: blue
---

# Levels of parallelism

Can execute code in parallel at many different levels:

1. At the instruction level - Single Instruction Multiple Data (SIMD)
1. Within the same process/program - Shared memory parallelism - e.g. OpenMP
1. Between different processes on the same computer - Non-shared memory - e.g. Python
multiprocessing
1. Between separate machines - Distributed computing - e.g. Message Passing Interface
(MPI)

This lecture will focus on the middle two.


# OpenMP

- OpenMP is a set of compiler directives (`#pragma omp ...`) that make writing a shared
memory parallel program significantly easier
- Supported by most modern compilers (GCC $\ge$ v6.1, Clang++ $\ge$ 3.9, Intel $\ge$
17.0, Microsoft Visual C++ $\ge$ 2005)
- OpenMP uses a fork-join model:

![](figs/Fork_join.svg)

# Core elements

![](figs/OpenMP_language_extensions.svg)

# Creating threads - parallel regions

- Use the `parallel` directive to create a parallel region
- program flow splits into $N$ threads ($N$ is set by the environment variable
  `OMP_NUM_THREADS`, the `omp_set_num_threads()` function, or the `num_threads` clause)
- compile using `-fopenmp` for GCC or Clang++, `/openmp` for Visual Studio, or `-openmp`
  for Intel

```cpp
#include <iostream>
#include <omp.h>

int main(void)
{
#pragma omp parallel num_threads(2)
{
const int id = omp_get_thread_num();
const int n = omp_get_num_threads();
std::cout << "Hello world. I am thread "<< id <<" of "<< n <<std::endl;
}
return 0;
}
```
# Shared versus private variables
- variables declared before the parallel region are shared
- those declared within the parallel region are private
- this can be explicitly specified using the `shared` and `private` clauses
```cpp
int a; // a is shared between threads
#pragma omp parallel
{
int b; // b is private to each thread
}
```

# Shared versus private variables

- variables declared before the parallel region are shared
- those declared within the parallel region are private
- this can be explicitly specified using the `shared` and `private` clauses

```cpp
int a; // a is shared between threads
int b; // b is private to each thread

#pragma omp parallel default(none) shared(a) private(b)
{
}
```

# Synchronisation - critical

- Critical regions are executed by one thread at a time

```cpp
#include <iostream>
#include <omp.h>

int main(void)
{
#pragma omp parallel
{
const int id = omp_get_thread_num();
const int n = omp_get_num_threads();
#pragma omp critical
{
std::cout << "Hello world. I am thread "<< id <<" of "<< n <<std::endl;
}
}
return 0;
}
```
# Synchronisation - atomic
- the `atomic` directive enables you to serialise a single instruction (a self-contained
  version of this counter is sketched after the snippet)
```cpp
int counter = 0;
#pragma omp parallel
{
...
#pragma omp atomic
counter++;
}
std::cout << counter << " threads in total"<<std::endl;
```
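
For reference, a minimal complete program built around this counter (a sketch; the work elided by `...` in the slide is simply dropped):

```cpp
#include <iostream>
#include <omp.h>

int main(void)
{
    int counter = 0;        // shared between threads
    #pragma omp parallel
    {
        #pragma omp atomic
        counter++;          // serialised increment, one per thread
    }
    std::cout << counter << " threads in total" << std::endl;
    return 0;
}
```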

# Loop constructs

- Often you are looking to run a specific `for` loop in parallel
- OpenMP provides the `omp parallel for` directive to do this automatically (a timed
  variant is sketched after the example below)

```cpp
std::vector<double> a(n), b(n);

#pragma omp parallel for
for (int i = 0; i < n; ++i) {
a[i] += b[i];
}
```
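
To check that the directive actually pays off, one might time the loop with `omp_get_wtime` (a sketch; the array size `n` here is an arbitrary assumption):

```cpp
#include <iostream>
#include <omp.h>
#include <vector>

int main() {
    const int n = 10000000;                  // assumed problem size
    std::vector<double> a(n, 1.0), b(n, 2.0);

    const double start = omp_get_wtime();
    #pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        a[i] += b[i];
    }
    const double elapsed = omp_get_wtime() - start;

    std::cout << "elapsed: " << elapsed << " s" << std::endl;
    return 0;
}
```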
# Loop constructs
- Up to the programmer to ensure that loop iterations are **independent**
- This will fail. Why? (a corrected version is sketched after the listing)
```cpp
const int stride = 5;
std::vector<double> a(n), b(stride*n);
int j = 0;
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
j += stride;
a[i] += b[j];
}
```
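
The failure is that `j` is shared and carries a dependence between iterations: threads race on `j += stride;`, so `b[j]` no longer pairs with the intended `a[i]`. One possible fix is to derive `j` from `i` so every iteration is independent (a sketch in the same fragment style as the slide):

```cpp
const int stride = 5;
std::vector<double> a(n), b(stride * n + 1);  // sized so b[(i+1)*stride] stays in range
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
    const int j = (i + 1) * stride;  // what the serial loop computes at iteration i
    a[i] += b[j];
}
```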

# Reductions

- The atomic counter above was an example of a reduction; this is a commonly used
  pattern in programming, and one that is easily parallelised.

```cpp
std::vector<double> a(n);
double sum = 0;
#pragma omp parallel for reduction(+: sum)
for (int i = 0; i < n; ++i) {
sum += a[i];
}
```
- OpenMP provides reductions for these operators: `+, -, *, &, |, ^, &&, ||`
- If `op` is the operator and `x` the variable, then you must use `x = x op expr`,
  `x = expr op x` or `x op= expr` within the parallel region (a hand-written equivalent
  of the reduction above is sketched below)
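
As a rough illustration of what the `reduction` clause automates, a hand-written equivalent might combine private partial sums atomically (a sketch, not what the compiler literally generates):

```cpp
std::vector<double> a(n);
double sum = 0;
#pragma omp parallel
{
    double local = 0;       // private partial sum for this thread
    #pragma omp for
    for (int i = 0; i < n; ++i) {
        local += a[i];
    }
    #pragma omp atomic
    sum += local;           // combine once per thread
}
```

The built-in clause is generally preferable: it handles per-operator initialisation and avoids contention in the hot loop.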
#
18 changes: 18 additions & 0 deletions 13_optimization_2/practicals/practical.md
@@ -60,6 +60,24 @@ void interactions(py::array_t<double> xn, py::array_t<double> yn,
will you get the cell positions from this class back to Python? What is the cost of
evaluating the interactions now, and how does this scale with $N$ (plot this)?

# OpenMP

1. The Mandelbrot set is the set of points $C$ in the complex plane that do not diverge
   under the following recurrence relation:

   $$
   z_{n+1} = z_n^2 + C
   $$

   where $z_0 = C$. The point $C$ is determined to be contained in the set if $z_n$ does
   not diverge to infinity. The serial code given in `mandelbrot.cpp` calculates the
   area of the (symmetric) Mandelbrot set by looping over a set of discrete points in
   a box covering the upper half of the complex plane. The equation above is applied to
   each point, and that point is determined to lie within the Mandelbrot set if the
   threshold condition $|z| \le 2$ is still satisfied after 2000 iterations. Parallelise
   this code using OpenMP and measure the time spent calculating the area for an
   increasing number of threads (one possible approach is sketched below).
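
A minimal sketch of the kind of parallelisation intended, assuming a plausible structure for `mandelbrot.cpp`; the grid resolution, box extents, and area bookkeeping here are all illustrative assumptions, not the provided code:

```cpp
#include <complex>
#include <iostream>

int main() {
    const int nx = 2000, ny = 1000;   // assumed grid resolution
    const int max_iter = 2000;        // iteration cap from the exercise
    long inside = 0;

    // box covering the upper half: Re(C) in [-2, 2], Im(C) in [0, 2]
    #pragma omp parallel for collapse(2) reduction(+: inside)
    for (int i = 0; i < nx; ++i) {
        for (int j = 0; j < ny; ++j) {
            const std::complex<double> c(-2.0 + 4.0 * i / nx, 2.0 * j / ny);
            std::complex<double> z = c;           // z_0 = C
            int n = 0;
            while (std::abs(z) <= 2.0 && n < max_iter) {
                z = z * z + c;                    // z_{n+1} = z_n^2 + C
                ++n;
            }
            if (n == max_iter) ++inside;          // never escaped: inside the set
        }
    }

    const double cell = (4.0 / nx) * (2.0 / ny);  // area of one grid cell
    std::cout << "area ~ " << 2.0 * inside * cell << std::endl;  // x2 for symmetry
    return 0;
}
```

The `reduction` clause gives each thread a private `inside` counter and combines them at the end, so no `critical` section is needed in the inner loop.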


# Python parallel programming

