Skip to content

Complete PR on TD 2021/01 #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
.. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
:target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master

.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
:target: https://travis-ci.org/sdpython/td3a_cpp
.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
:target: https://app.travis-ci.com/github/sdpython/td3a_cpp
:alt: Build status

.. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
Expand Down Expand Up @@ -68,7 +68,7 @@ Or:
::

python -m pytest

To check style:

::
Expand Down
2 changes: 1 addition & 1 deletion bin/doc.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
%pythonexe% -m sphinx -T -b html doc dist/html

if %errorlevel% neq 0 exit /b %errorlevel%
@echo Done Testing.
@echo Done Testing.
2 changes: 1 addition & 1 deletion bin/flake8.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
%pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py

if %errorlevel% neq 0 exit /b %errorlevel%
@echo Done Testing.
@echo Done Testing.
2 changes: 1 addition & 1 deletion bin/unittest.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
%pythonexe% -m unittest discover tests

if %errorlevel% neq 0 exit /b %errorlevel%
@echo Done Testing.
@echo Done Testing.
1 change: 0 additions & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


===
API
===
Expand Down
2 changes: 0 additions & 2 deletions doc/appendix/html.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@ Visual outputs from example 'Profile a function'
The following pages were generated by the example
:ref:`l-example-dot-profile`.


.. raw:: html
:file: _dot_pyinstrument.html

.. raw:: html
:file: _dotpyspy.svg

2 changes: 1 addition & 1 deletion doc/appendix/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ Appendix
Additional pages not easily rendered in other places.

.. toctree::

html
9 changes: 8 additions & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,14 @@
'Miscellaneous'),
]

intersphinx_mapping = {'https://docs.python.org/': None}
intersphinx_mapping = {
"python": ("https://docs.python.org/{.major}".format(
sys.version_info), None),
"numpy": ("https://numpy.org/doc/stable", None),
"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
"matplotlib": ("https://matplotlib.org/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None)
}

sphinx_gallery_conf = {
# path to your examples scripts
Expand Down
2 changes: 1 addition & 1 deletion doc/dev.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Or:
::

python -m pytest

To check style:

::
Expand Down
5 changes: 2 additions & 3 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ td3a_cpp: template for a python module with cython and openmp
.. image:: https://circleci.com/gh/sdpython/td3a_cpp/tree/master.svg?style=svg
:target: https://circleci.com/gh/sdpython/td3a_cpp/tree/master

.. image:: https://travis-ci.org/sdpython/td3a_cpp.svg?branch=master
:target: https://travis-ci.org/sdpython/td3a_cpp
.. image:: https://travis-ci.com/sdpython/td3a_cpp.svg?branch=master
:target: https://app.travis-ci.com/github/sdpython/td3a_cpp
:alt: Build status

.. image:: https://ci.appveyor.com/api/projects/status/wvo6ovlaxi8ypua4?svg=true
Expand Down Expand Up @@ -38,7 +38,6 @@ to write parallelized algorithm.
.. toctree::
:maxdepth: 2

tutorial
api
auto_examples/index
dev
Expand Down
36 changes: 0 additions & 36 deletions doc/tutorial.rst

This file was deleted.

4 changes: 2 additions & 2 deletions examples/plot_benchmark_dot_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ def get_vectors(fct, n, h=100, dtype=numpy.float64):

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
cc.pivot('N', 'fct', 'average').plot(
logy=True, ax=ax[0])
logy=True, ax=ax[0])
cc.pivot('N', 'fct', 'average').plot(
logy=True, logx=True, ax=ax[1])
logy=True, logx=True, ax=ax[1])
ax[0].set_title("Comparison of cython sdot implementations")
ax[1].set_title("Comparison of cython sdot implementations")

Expand Down
12 changes: 6 additions & 6 deletions examples/plot_benchmark_dot_mul.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=lambda x, y: dmul_cython_omp(
x, y, algo=algo, parallel=parallel),
x_name=n)
for n in sets]
x, y, algo=algo, parallel=parallel),
x_name=n)
for n in sets]

res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
dfs.append(DataFrame(res))
Expand Down Expand Up @@ -114,9 +114,9 @@
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=lambda x, y: dmul_cython_omp(
x, y, algo=algo, parallel=parallel, b_trans=1),
x_name=n)
for n in sets]
x, y, algo=algo, parallel=parallel, b_trans=1),
x_name=n)
for n in sets]

res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
dfs.append(DataFrame(res))
Expand Down
81 changes: 81 additions & 0 deletions examples/plot_benchmark_dot_mul_timeit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""

.. _l-example-mul-timeit:

Compares mul implementations with timeit
========================================

:epkg:`numpy` has a very fast implementation of
matrix multiplication. There are many ways to be slower.
The following uses :epkg:`timeit` to compare implementations.

.. contents::
:local:

Preparation
+++++++++++
"""
import timeit
import numpy

from td3a_cpp.tutorial.td_mul_cython import (
    multiply_matrix, c_multiply_matrix,
    c_multiply_matrix_parallel,
    c_multiply_matrix_parallel_transposed as cmulparamtr)

# Repetition counts given to timeit. The pure python implementation is
# run fewer times than the others, so the totals returned by timeit are
# not directly comparable: they are divided by these counts before any
# ratio is computed.
N_FAST = 100
N_SLOW = 10

va = numpy.random.randn(150, 100).astype(numpy.float64)
vb = numpy.random.randn(100, 100).astype(numpy.float64)
# Namespace in which timeit evaluates the measured statements.
ctx = {
    'va': va, 'vb': vb, 'c_multiply_matrix': c_multiply_matrix,
    'multiply_matrix': multiply_matrix,
    'c_multiply_matrix_parallel': c_multiply_matrix_parallel,
    'c_multiply_matrix_parallel_transposed': cmulparamtr}

##########################################
# Measures
# ++++++++
#
# numpy
res0 = timeit.timeit('va @ vb', number=N_FAST, globals=ctx)
print("numpy time", res0)

###########################
# python implementation

res1 = timeit.timeit(
    'multiply_matrix(va, vb)', number=N_SLOW, globals=ctx)
print('python implementation', res1)


###########################
# cython implementation

res2 = timeit.timeit(
    'c_multiply_matrix(va, vb)', number=N_FAST, globals=ctx)
print('cython implementation', res2)


###########################
# cython implementation parallelized

res3 = timeit.timeit(
    'c_multiply_matrix_parallel(va, vb)', number=N_FAST, globals=ctx)
print('cython implementation parallelized', res3)


###########################
# cython implementation parallelized, AVX + transposed

res4 = timeit.timeit(
    'c_multiply_matrix_parallel_transposed(va, vb)',
    number=N_FAST, globals=ctx)
print('cython implementation parallelized avx', res4)


############################
# Speed up...

# timeit returns the total time of all runs: convert every total into
# a time per call so that measures taken with different repetition
# counts (N_SLOW vs N_FAST) can be compared.
numpy_call = res0 / N_FAST
python_call = res1 / N_SLOW
cython_call = res2 / N_FAST
parallel_call = res3 / N_FAST
transposed_call = res4 / N_FAST

print("numpy is %f faster than pure python." % (python_call / numpy_call))
print("numpy is %f faster than cython." % (cython_call / numpy_call))
print("numpy is %f faster than parallelized cython." % (
    parallel_call / numpy_call))
print("numpy is %f faster than avx parallelized cython." % (
    transposed_call / numpy_call))
5 changes: 3 additions & 2 deletions examples/plot_long_parallel_process_joblib.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ def parallel_dot_joblib(va, vb, max_workers=2):
raise RuntimeError("size must be a multiple of max_workers.")

r = Parallel(n_jobs=max_workers, backend="loky")(
delayed(numpy.dot)(va[i*dhk:i*dhk+dhk], vb[i*dhk:i*dhk+dhk])
for i in range(max_workers * k))
delayed(numpy.dot)(va[i * dhk:i * dhk + dhk],
vb[i * dhk:i * dhk + dhk])
for i in range(max_workers * k))
return sum(r)

###########################
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_extension_tutorial(name):
srcs = ['td3a_cpp/tutorial/%s.pyx' % name]
args = get_defined_args()
if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp',
'mul_cython_omp']:
'mul_cython_omp', 'td_mul_cython']:
srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name])
args['language'] = "c++"

Expand Down
43 changes: 31 additions & 12 deletions td3a_cpp/tutorial/mul_cython_omp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,32 @@ cimport openmp
cnumpy.import_array()


# Computes row ``p`` of the product stored in ``res``: every cell of that
# row is reset to zero and then accumulates va[p, k] * vb[k, j] over k.
# The indexing treats the three buffers as flat row-major arrays
# (va with rows of length nk, vb and res with rows of length nj).
# ``ni`` is not used here; it is presumably kept so that the helpers of
# this family share one signature.
# There is no explicit return statement.
cdef int _dmul_cython_omp01(const double* va, const double* vb, double* res,
                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
    cdef Py_ssize_t col, inner
    # Offsets of row p in res and va, computed once.
    cdef Py_ssize_t out_row = p * nj
    cdef Py_ssize_t a_row = p * nk
    for col in range(nj):
        res[out_row + col] = 0
        for inner in range(nk):
            res[out_row + col] += va[a_row + inner] * vb[inner * nj + col]


# Computes column ``p`` of the product stored in ``res``: for every row i,
# the cell (i, p) is reset to zero and then accumulates
# va[i, k] * vb[k, p] over k.
# The indexing treats the three buffers as flat row-major arrays
# (va with rows of length nk, vb and res with rows of length nj).
# There is no explicit return statement.
cdef int _dmul_cython_omp11(const double* va, const double* vb, double* res,
                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
    cdef Py_ssize_t row, inner, out_idx, a_row
    for row in range(ni):
        # Position of cell (row, p) in res and start of the row in va.
        out_idx = row * nj + p
        a_row = row * nk
        res[out_idx] = 0
        for inner in range(nk):
            res[out_idx] += va[a_row + inner] * vb[inner * nj + p]


# Adds to every cell (i, j) of ``res`` the contribution of the inner
# index ``p``: va[i, p] * vb[p, j].
# ``res`` is only accumulated into, never reset: the caller is expected to
# provide an initialised buffer.
# The indexing treats the three buffers as flat row-major arrays
# (va with rows of length nk, vb and res with rows of length nj).
# NOTE(review): every value of p updates the same cells of res; if this is
# called concurrently for different p, confirm the updates cannot collide.
# There is no explicit return statement.
cdef int _dmul_cython_omp21(const double* va, const double* vb, double* res,
                            Py_ssize_t p, Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk) nogil:
    cdef Py_ssize_t row, col, out_row, a_idx
    # Offset of row p in vb, shared by all iterations.
    cdef Py_ssize_t b_row = p * nj
    for row in range(ni):
        out_row = row * nj
        a_idx = row * nk + p
        for col in range(nj):
            res[out_row + col] += va[a_idx] * vb[b_row + col]


cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk,
cython.int algo, cython.int parallel) nogil:
Expand Down Expand Up @@ -57,32 +83,25 @@ cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
if parallel == 1:
if algo == 0:
for p in prange(0, ni):
for j in range(0, nj):
res[p * nj + j] = 0
for k in range(0, nk):
res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
_dmul_cython_omp01(va, vb, res, p, ni, nj, nk)
return 1

if algo == 1:
for p in prange(0, nj):
for i in range(0, ni):
res[i * nj + p] = 0
for k in range(0, nk):
res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
_dmul_cython_omp11(va, vb, res, p, ni, nj, nk)
return 1

if algo == 2:
for p in prange(0, nk):
for i in range(0, ni):
for j in range(0, nj):
res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]
_dmul_cython_omp21(va, vb, res, p, ni, nj, nk)
return 1

return 0


cdef extern from "mul_cython_omp_.h":
cdef double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, cython.int size) nogil
cdef double vector_ddot_product_pointer16_sse(
const double *p1, const double *p2, cython.int size) nogil


cdef int _dmul_cython_omp_t(const double* va, const double* vb, double* res,
Expand Down
Loading