Skip to content

Commit be7dac9

Browse files
authored
Complete PR on TD 2021/01 (#3)
* Finish previous PR 2021/01 * lint * Update conf.py
1 parent a9e3168 commit be7dac9

22 files changed

+25224
-79
lines changed

README.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
.. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
33
:target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master
44

5-
.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
6-
:target: https://travis-ci.org/sdpython/td3a_cpp
5+
.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
6+
:target: https://app.travis-ci.com/github/sdpython/td3a_cpp
77
:alt: Build status
88

99
.. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
@@ -68,7 +68,7 @@ Or:
6868
::
6969

7070
python -m pytest
71-
71+
7272
To check style:
7373

7474
::

bin/doc.bat

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
99
%pythonexe% -m sphinx -T -b html doc dist/html
1010

1111
if %errorlevel% neq 0 exit /b %errorlevel%
12-
@echo Done Testing.
12+
@echo Done Testing.

bin/flake8.bat

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
99
%pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py
1010

1111
if %errorlevel% neq 0 exit /b %errorlevel%
12-
@echo Done Testing.
12+
@echo Done Testing.

bin/unittest.bat

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
99
%pythonexe% -m unittest discover tests
1010

1111
if %errorlevel% neq 0 exit /b %errorlevel%
12-
@echo Done Testing.
12+
@echo Done Testing.

doc/api.rst

-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11

2-
32
===
43
API
54
===

doc/appendix/html.rst

-2
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,8 @@ Visual outputs from example 'Profile a function'
1212
The following pages were generated by the example
1313
:ref:`l-example-dot-profile`.
1414

15-
1615
.. raw:: html
1716
:file: _dot_pyinstrument.html
1817

1918
.. raw:: html
2019
:file: _dotpyspy.svg
21-

doc/appendix/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ Appendix
55
Additional pages not easily rendered in other places.
66

77
.. toctree::
8-
8+
99
html

doc/conf.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,14 @@
8989
'Miscellaneous'),
9090
]
9191

92-
intersphinx_mapping = {'https://docs.python.org/': None}
92+
intersphinx_mapping = {
93+
"python": ("https://docs.python.org/{.major}".format(
94+
sys.version_info), None),
95+
"numpy": ("https://numpy.org/doc/stable", None),
96+
"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
97+
"matplotlib": ("https://matplotlib.org/", None),
98+
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None)
99+
}
93100

94101
sphinx_gallery_conf = {
95102
# path to your examples scripts

doc/dev.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Or:
3737
::
3838

3939
python -m pytest
40-
40+
4141
To check style:
4242

4343
::

doc/index.rst

+2-3
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ td3a_cpp: template for a python module with cython and openmp
55
.. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
66
:target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master
77

8-
.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
9-
:target: https://travis-ci.org/sdpython/td3a_cpp
8+
.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
9+
:target: https://app.travis-ci.com/github/sdpython/td3a_cpp
1010
:alt: Build status
1111

1212
.. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
@@ -38,7 +38,6 @@ to write parallelized algorithm.
3838
.. toctree::
3939
:maxdepth: 2
4040

41-
tutorial
4241
api
4342
auto_examples/index
4443
dev

doc/tutorial.rst

-36
This file was deleted.

examples/plot_benchmark_dot_cython.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,9 @@ def get_vectors(fct, n, h=100, dtype=numpy.float64):
119119

120120
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
121121
cc.pivot('N', 'fct', 'average').plot(
122-
logy=True, ax=ax[0])
122+
logy=True, ax=ax[0])
123123
cc.pivot('N', 'fct', 'average').plot(
124-
logy=True, logx=True, ax=ax[1])
124+
logy=True, logx=True, ax=ax[1])
125125
ax[0].set_title("Comparison of cython sdot implementations")
126126
ax[1].set_title("Comparison of cython sdot implementations")
127127

examples/plot_benchmark_dot_mul.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@
6969
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
7070
vb=numpy.random.randn(n, n).astype(numpy.float64),
7171
mul=lambda x, y: dmul_cython_omp(
72-
x, y, algo=algo, parallel=parallel),
73-
x_name=n)
74-
for n in sets]
72+
x, y, algo=algo, parallel=parallel),
73+
x_name=n)
74+
for n in sets]
7575

7676
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
7777
dfs.append(DataFrame(res))
@@ -114,9 +114,9 @@
114114
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
115115
vb=numpy.random.randn(n, n).astype(numpy.float64),
116116
mul=lambda x, y: dmul_cython_omp(
117-
x, y, algo=algo, parallel=parallel, b_trans=1),
118-
x_name=n)
119-
for n in sets]
117+
x, y, algo=algo, parallel=parallel, b_trans=1),
118+
x_name=n)
119+
for n in sets]
120120

121121
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
122122
dfs.append(DataFrame(res))
+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
3+
.. _l-example-mul-timeit:
4+
5+
Compares mul implementations with timeit
6+
========================================
7+
8+
:epkg:`numpy` has a very fast implementation of
9+
matrix multiplication. There are many ways to be slower.
10+
The following uses :epkg:`timeit` to compare implementations.
11+
12+
.. contents::
13+
:local:
14+
15+
Preparation
16+
+++++++++++
17+
"""
18+
import timeit
19+
import numpy
20+
21+
from td3a_cpp.tutorial.td_mul_cython import (
22+
multiply_matrix, c_multiply_matrix,
23+
c_multiply_matrix_parallel,
24+
c_multiply_matrix_parallel_transposed as cmulparamtr)
25+
26+
27+
va = numpy.random.randn(150, 100).astype(numpy.float64)
28+
vb = numpy.random.randn(100, 100).astype(numpy.float64)
29+
ctx = {
30+
'va': va, 'vb': vb, 'c_multiply_matrix': c_multiply_matrix,
31+
'multiply_matrix': multiply_matrix,
32+
'c_multiply_matrix_parallel': c_multiply_matrix_parallel,
33+
'c_multiply_matrix_parallel_transposed': cmulparamtr}
34+
35+
##########################################
36+
# Measures
37+
# ++++++++
38+
#
39+
# numpy
40+
res0 = timeit.timeit('va @ vb', number=100, globals=ctx)
41+
print("numpy time", res0)
42+
43+
###########################
44+
# python implementation
45+
46+
res1 = timeit.timeit(
47+
'multiply_matrix(va, vb)', number=10, globals=ctx)
48+
print('python implementation', res1)
49+
50+
51+
###########################
52+
# cython implementation
53+
54+
res2 = timeit.timeit(
55+
'c_multiply_matrix(va, vb)', number=100, globals=ctx)
56+
print('cython implementation', res2)
57+
58+
59+
###########################
60+
# cython implementation parallelized
61+
62+
res3 = timeit.timeit(
63+
'c_multiply_matrix_parallel(va, vb)', number=100, globals=ctx)
64+
print('cython implementation parallelized', res3)
65+
66+
67+
###########################
68+
# cython implementation parallelized, AVX + transposed
69+
70+
res4 = timeit.timeit(
71+
'c_multiply_matrix_parallel_transposed(va, vb)', number=100, globals=ctx)
72+
print('cython implementation parallelized avx', res4)
73+
74+
75+
############################
76+
# Speed up...
77+
78+
print("numpy is %f faster than pure python." % (res1 / res0))
79+
print("numpy is %f faster than cython." % (res2 / res0))
80+
print("numpy is %f faster than parallelized cython." % (res3 / res0))
81+
print("numpy is %f faster than avx parallelized cython." % (res4 / res0))

examples/plot_long_parallel_process_joblib.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ def parallel_dot_joblib(va, vb, max_workers=2):
2929
raise RuntimeError("size must be a multiple of max_workers.")
3030

3131
r = Parallel(n_jobs=max_workers, backend="loky")(
32-
delayed(numpy.dot)(va[i*dhk:i*dhk+dhk], vb[i*dhk:i*dhk+dhk])
33-
for i in range(max_workers * k))
32+
delayed(numpy.dot)(va[i * dhk:i * dhk + dhk],
33+
vb[i * dhk:i * dhk + dhk])
34+
for i in range(max_workers * k))
3435
return sum(r)
3536

3637
###########################

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def get_extension_tutorial(name):
5959
srcs = ['td3a_cpp/tutorial/%s.pyx' % name]
6060
args = get_defined_args()
6161
if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp',
62-
'mul_cython_omp']:
62+
'mul_cython_omp', 'td_mul_cython']:
6363
srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name])
6464
args['language'] = "c++"
6565

td3a_cpp/tutorial/mul_cython_omp.pyx

+31-12
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,32 @@ cimport openmp
1212
cnumpy.import_array()
1313

1414

15+
cdef int _dmul_cython_omp01(const double* va, const double* vb, double* res,
16+
Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
17+
cdef Py_ssize_t j, k
18+
for j in range(0, nj):
19+
res[p * nj + j] = 0
20+
for k in range(0, nk):
21+
res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
22+
23+
24+
cdef int _dmul_cython_omp11(const double* va, const double* vb, double* res,
25+
Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
26+
cdef Py_ssize_t i, k
27+
for i in range(0, ni):
28+
res[i * nj + p] = 0
29+
for k in range(0, nk):
30+
res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
31+
32+
33+
cdef int _dmul_cython_omp21(const double* va, const double* vb, double* res,
34+
Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
35+
cdef Py_ssize_t i, j
36+
for i in range(0, ni):
37+
for j in range(0, nj):
38+
res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]
39+
40+
1541
cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
1642
Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk,
1743
cython.int algo, cython.int parallel) nogil:
@@ -57,32 +83,25 @@ cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
5783
if parallel == 1:
5884
if algo == 0:
5985
for p in prange(0, ni):
60-
for j in range(0, nj):
61-
res[p * nj + j] = 0
62-
for k in range(0, nk):
63-
res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
86+
_dmul_cython_omp01(va, vb, res, p, ni, nj, nk)
6487
return 1
6588

6689
if algo == 1:
6790
for p in prange(0, nj):
68-
for i in range(0, ni):
69-
res[i * nj + p] = 0
70-
for k in range(0, nk):
71-
res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
91+
_dmul_cython_omp11(va, vb, res, p, ni, nj, nk)
7292
return 1
7393

7494
if algo == 2:
7595
for p in prange(0, nk):
76-
for i in range(0, ni):
77-
for j in range(0, nj):
78-
res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]
96+
_dmul_cython_omp21(va, vb, res, p, ni, nj, nk)  # p indexes k here: use the k-slice helper, not the column helper
7997
return 1
8098

8199
return 0
82100

83101

84102
cdef extern from "mul_cython_omp_.h":
85-
cdef double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, cython.int size) nogil
103+
cdef double vector_ddot_product_pointer16_sse(
104+
const double *p1, const double *p2, cython.int size) nogil
86105

87106

88107
cdef int _dmul_cython_omp_t(const double* va, const double* vb, double* res,

0 commit comments

Comments
 (0)