Complete PR on TD 2021/01 (#3)

sdpython · web-flow · commit be7dac94f848 · 2021-10-09T11:27:31.000+02:00
* Finish previous PR 2021/01
* lint
* Update conf.py
diff --git a/README.rst b/README.rst
@@ -2,8 +2,8 @@
 .. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
     :target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master
 
-.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
-    :target: https://travis-ci.org/sdpython/td3a_cpp
+.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
+    :target: https://app.travis-ci.com/github/sdpython/td3a_cpp
     :alt: Build status
 
 .. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
@@ -68,7 +68,7 @@ Or:
 ::
 
     python -m pytest
-    
+
 To check style:
 
 ::
diff --git a/bin/doc.bat b/bin/doc.bat
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 %pythonexe% -m sphinx -T -b html doc dist/html
 
 if %errorlevel% neq 0 exit /b %errorlevel%
-@echo Done Testing.
+@echo Done Testing.
diff --git a/bin/flake8.bat b/bin/flake8.bat
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 %pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py
 
 if %errorlevel% neq 0 exit /b %errorlevel%
-@echo Done Testing.
+@echo Done Testing.
diff --git a/bin/unittest.bat b/bin/unittest.bat
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 %pythonexe% -m unittest discover tests
 
 if %errorlevel% neq 0 exit /b %errorlevel%
-@echo Done Testing.
+@echo Done Testing.
diff --git a/doc/api.rst b/doc/api.rst
@@ -1,5 +1,4 @@
 
-
 ===
 API
 ===
diff --git a/doc/appendix/html.rst b/doc/appendix/html.rst
@@ -12,10 +12,8 @@ Visual outputs from example 'Profile a function'
 The following pages were generated by the example
 :ref:`l-example-dot-profile`.
 
-
 .. raw:: html
     :file: _dot_pyinstrument.html
 
 .. raw:: html
     :file: _dotpyspy.svg
-
diff --git a/doc/appendix/index.rst b/doc/appendix/index.rst
@@ -5,5 +5,5 @@ Appendix
 Additional pages not easily rendered in other places.
 
 .. toctree::
-    
+
     html
diff --git a/doc/conf.py b/doc/conf.py
@@ -89,7 +89,14 @@
      'Miscellaneous'),
 ]
 
-intersphinx_mapping = {'https://docs.python.org/': None}
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/{.major}".format(
+        sys.version_info), None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
+    "matplotlib": ("https://matplotlib.org/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None)
+}
 
 sphinx_gallery_conf = {
     # path to your examples scripts
diff --git a/doc/dev.rst b/doc/dev.rst
@@ -37,7 +37,7 @@ Or:
 ::
 
     python -m pytest
-    
+
 To check style:
 
 ::
diff --git a/doc/index.rst b/doc/index.rst
@@ -5,8 +5,8 @@ td3a_cpp: template for a python module with cython and openmp
 .. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
     :target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master
 
-.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
-    :target: https://travis-ci.org/sdpython/td3a_cpp
+.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
+    :target: https://app.travis-ci.com/github/sdpython/td3a_cpp
     :alt: Build status
 
 .. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
@@ -38,7 +38,6 @@ to write parallelized algorithm.
 .. toctree::
     :maxdepth: 2
 
-    tutorial
     api
     auto_examples/index
     dev
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
diff --git a/examples/plot_benchmark_dot_cython.py b/examples/plot_benchmark_dot_cython.py
@@ -119,9 +119,9 @@ def get_vectors(fct, n, h=100, dtype=numpy.float64):
 
 fig, ax = plt.subplots(1, 2, figsize=(10, 4))
 cc.pivot('N', 'fct', 'average').plot(
-         logy=True, ax=ax[0])
+    logy=True, ax=ax[0])
 cc.pivot('N', 'fct', 'average').plot(
-         logy=True, logx=True, ax=ax[1])
+    logy=True, logx=True, ax=ax[1])
 ax[0].set_title("Comparison of cython sdot implementations")
 ax[1].set_title("Comparison of cython sdot implementations")
 
diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py
@@ -69,9 +69,9 @@
         ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
                      vb=numpy.random.randn(n, n).astype(numpy.float64),
                      mul=lambda x, y: dmul_cython_omp(
-                        x, y, algo=algo, parallel=parallel),
-                     x_name=n)
-                for n in sets]
+            x, y, algo=algo, parallel=parallel),
+            x_name=n)
+            for n in sets]
 
         res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
         dfs.append(DataFrame(res))
@@ -114,9 +114,9 @@
         ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
                      vb=numpy.random.randn(n, n).astype(numpy.float64),
                      mul=lambda x, y: dmul_cython_omp(
-                        x, y, algo=algo, parallel=parallel, b_trans=1),
-                     x_name=n)
-                for n in sets]
+            x, y, algo=algo, parallel=parallel, b_trans=1),
+            x_name=n)
+            for n in sets]
 
         res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
         dfs.append(DataFrame(res))
diff --git a/examples/plot_benchmark_dot_mul_timeit.py b/examples/plot_benchmark_dot_mul_timeit.py
@@ -0,0 +1,81 @@
+"""
+
+.. _l-example-mul-timeit:
+
+Compares mul implementations with timeit
+========================================
+
+:epkg:`numpy` has a very fast implementation of
+matrix multiplication. There are many ways to be slower.
+The following uses :epkg:`timeit` to compare implementations.
+
+.. contents::
+    :local:
+
+Preparation
++++++++++++
+"""
+import timeit
+import numpy
+
+from td3a_cpp.tutorial.td_mul_cython import (
+    multiply_matrix, c_multiply_matrix,
+    c_multiply_matrix_parallel,
+    c_multiply_matrix_parallel_transposed as cmulparamtr)
+
+
+va = numpy.random.randn(150, 100).astype(numpy.float64)
+vb = numpy.random.randn(100, 100).astype(numpy.float64)
+ctx = {
+    'va': va, 'vb': vb, 'c_multiply_matrix': c_multiply_matrix,
+    'multiply_matrix': multiply_matrix,
+    'c_multiply_matrix_parallel': c_multiply_matrix_parallel,
+    'c_multiply_matrix_parallel_transposed': cmulparamtr}
+
+##########################################
+# Measures
+# ++++++++
+#
+# numpy
+res0 = timeit.timeit('va @ vb', number=100, globals=ctx)
+print("numpy time", res0)
+
+###########################
+# python implementation
+
+res1 = timeit.timeit(
+    'multiply_matrix(va, vb)', number=10, globals=ctx)
+print('python implementation', res1)
+
+
+###########################
+# cython implementation
+
+res2 = timeit.timeit(
+    'c_multiply_matrix(va, vb)', number=100, globals=ctx)
+print('cython implementation', res2)
+
+
+###########################
+# cython implementation parallelized
+
+res3 = timeit.timeit(
+    'c_multiply_matrix_parallel(va, vb)', number=100, globals=ctx)
+print('cython implementation parallelized', res3)
+
+
+###########################
+# cython implementation parallelized, AVX + transposed
+
+res4 = timeit.timeit(
+    'c_multiply_matrix_parallel_transposed(va, vb)', number=100, globals=ctx)
+print('cython implementation parallelized avx', res4)
+
+
+############################
+# Speed up, per call: the python run used number=10, the others number=100.
+
+print("numpy is %f faster than pure python." % ((res1 / 10) / (res0 / 100)))
+print("numpy is %f faster than cython." % (res2 / res0))
+print("numpy is %f faster than parallelized cython." % (res3 / res0))
+print("numpy is %f faster than avx parallelized cython." % (res4 / res0))
diff --git a/examples/plot_long_parallel_process_joblib.py b/examples/plot_long_parallel_process_joblib.py
@@ -29,8 +29,9 @@ def parallel_dot_joblib(va, vb, max_workers=2):
         raise RuntimeError("size must be a multiple of max_workers.")
 
     r = Parallel(n_jobs=max_workers, backend="loky")(
-            delayed(numpy.dot)(va[i*dhk:i*dhk+dhk], vb[i*dhk:i*dhk+dhk])
-            for i in range(max_workers * k))
+        delayed(numpy.dot)(va[i * dhk:i * dhk + dhk],
+                           vb[i * dhk:i * dhk + dhk])
+        for i in range(max_workers * k))
     return sum(r)
 
 ###########################
diff --git a/setup.py b/setup.py
@@ -59,7 +59,7 @@ def get_extension_tutorial(name):
     srcs = ['td3a_cpp/tutorial/%s.pyx' % name]
     args = get_defined_args()
     if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp',
-                'mul_cython_omp']:
+                'mul_cython_omp', 'td_mul_cython']:
         srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name])
         args['language'] = "c++"
 
diff --git a/td3a_cpp/tutorial/mul_cython_omp.pyx b/td3a_cpp/tutorial/mul_cython_omp.pyx
@@ -12,6 +12,32 @@ cimport openmp
 cnumpy.import_array()
 
 
+cdef int _dmul_cython_omp01(const double* va, const double* vb, double* res,
+                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:  # ni is not read here
+    cdef Py_ssize_t j, k  # sets res[p*nj + j] = sum over k of va[p*nk + k] * vb[k*nj + j], for every j
+    for j in range(0, nj):
+        res[p * nj + j] = 0  # reset the cell before accumulating
+        for k in range(0, nk):
+            res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
+
+
+cdef int _dmul_cython_omp11(const double* va, const double* vb, double* res,
+                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
+    cdef Py_ssize_t i, k  # sets res[i*nj + p] = sum over k of va[i*nk + k] * vb[k*nj + p], for every i
+    for i in range(0, ni):
+        res[i * nj + p] = 0  # reset the cell before accumulating
+        for k in range(0, nk):
+            res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
+
+
+cdef int _dmul_cython_omp21(const double* va, const double* vb, double* res,
+                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
+    cdef Py_ssize_t i, j  # adds va[i*nk + p] * vb[p*nj + j] to every res[i*nj + j]
+    for i in range(0, ni):
+        for j in range(0, nj):
+            res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]  # accumulates only; res is never reset in this helper
+
+
 cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
                           Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk,
                           cython.int algo, cython.int parallel) nogil:
@@ -57,32 +83,25 @@ cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
     if parallel == 1:
         if algo == 0:
             for p in prange(0, ni):
-                for j in range(0, nj):
-                    res[p * nj + j] = 0
-                    for k in range(0, nk):
-                        res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
+                _dmul_cython_omp01(va, vb, res, p, ni, nj, nk)
             return 1
 
         if algo == 1:
             for p in prange(0, nj):
-                for i in range(0, ni):
-                    res[i * nj + p] = 0
-                    for k in range(0, nk):
-                        res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
+                _dmul_cython_omp11(va, vb, res, p, ni, nj, nk)
             return 1
 
         if algo == 2:
             for p in prange(0, nk):
-                for i in range(0, ni):
-                    for j in range(0, nj):
-                        res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]
+                _dmul_cython_omp21(va, vb, res, p, ni, nj, nk)  # adds the p-th term of the k-sum to every res cell
             return 1
     
     return 0
 
 
 cdef extern from "mul_cython_omp_.h":
-    cdef double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, cython.int size) nogil
+    cdef double vector_ddot_product_pointer16_sse(
+        const double *p1, const double *p2, cython.int size) nogil
 
 
 cdef int _dmul_cython_omp_t(const double* va, const double* vb, double* res,
diff --git a/td3a_cpp/tutorial/td_mul_cython.cpp b/td3a_cpp/tutorial/td_mul_cython.cpp
diff --git a/td3a_cpp/tutorial/td_mul_cython.pyx b/td3a_cpp/tutorial/td_mul_cython.pyx
diff --git a/td3a_cpp/tutorial/td_mul_cython_.cpp b/td3a_cpp/tutorial/td_mul_cython_.cpp
diff --git a/td3a_cpp/tutorial/td_mul_cython_.h b/td3a_cpp/tutorial/td_mul_cython_.h
diff --git a/tests/test_tutorial_td.py b/tests/test_tutorial_td.py

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`
`2`		`-`
`3`	`2`	`===`
`4`	`3`	`API`
`5`	`4`	`===`