
Commit 7b29fe9

Merge pull request #24 from tensor-compiler/variants-and-fusing
numpy,taco: updates to windowing benchmarks, add a fusing benchmark
2 parents 3317204 + e7fb9c5 commit 7b29fe9

7 files changed: +98 −41 lines changed

numpy/ufuncs.py

+11 −1

@@ -3,7 +3,7 @@
 import sparse
 import pytest
 import os
-from util import TensorCollectionFROSTT, PydataTensorShifter, TensorCollectionSuiteSparse, ScipyTensorShifter, PydataMatrixMarketTensorLoader, ScipyMatrixMarketTensorLoader, VALIDATION_OUTPUT_PATH, PydataSparseTensorDumper, SuiteSparseTensor, safeCastPydataTensorToInts
+from util import TensorCollectionFROSTT, PydataTensorShifter, TensorCollectionSuiteSparse, ScipyTensorShifter, PydataMatrixMarketTensorLoader, ScipyMatrixMarketTensorLoader, VALIDATION_OUTPUT_PATH, PydataSparseTensorDumper, SuiteSparseTensor, safeCastPydataTensorToInts, RandomPydataSparseTensorLoader

 # TODO (rohany): Ask hameer about this. pydata/sparse isn't happy when
 # given this ufunc to evaluate.
@@ -34,6 +34,16 @@ def bench():
         C = ufunc(A, B)
     tacoBench(bench)

+@pytest.mark.parametrize("dim", [5000, 10000, 20000])
+def bench_pydata_ufunc_fused(tacoBench, dim):
+    loader = RandomPydataSparseTensorLoader()
+    matrix = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01))
+    matrix1 = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01, variant=1))
+    matrix2 = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01, variant=2))
+    def bench():
+        result = numpy.logical_and(numpy.logical_xor(matrix, matrix1), matrix2)
+        return result
+    tacoBench(bench)

 def import_tensor(filename, dim):
     print(filename)
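
A note on what the new fused benchmark measures: it evaluates the element-wise expression (A xor B) and C over three independently generated 1%-dense operands. taco can compile this into a single fused sparse kernel, while the numpy/pydata baseline evaluates one ufunc at a time and materializes the logical_xor intermediate. A minimal standalone sketch of the same computation (assumes only numpy and sparse are installed; the repo's loader and tacoBench fixture are not used here):

import numpy
import sparse

dim = 1000
# Three independent operands at the benchmark's 1% density.
A = sparse.random((dim, dim), density=0.01).astype(numpy.int64)
B = sparse.random((dim, dim), density=0.01).astype(numpy.int64)
C = sparse.random((dim, dim), density=0.01).astype(numpy.int64)

# numpy dispatches these ufuncs to pydata/sparse; the logical_xor
# result is materialized before logical_and consumes it.
result = numpy.logical_and(numpy.logical_xor(A, B), C)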

numpy/util.py

+9 −6

@@ -92,10 +92,13 @@ def dump(self, tensor, path):
 # The key itself is formatted by the dimensions, followed by the
 # sparsity. For example, a 250 by 250 tensor with sparsity 0.01
 # would have a key of 250x250-0.01.tns.
-def construct_random_tensor_key(shape, sparsity):
+def construct_random_tensor_key(shape, sparsity, variant):
     path = TENSOR_PATH
     dims = "x".join([str(dim) for dim in shape])
-    key = "{}-{}.tns".format(dims, sparsity)
+    if variant is None:
+        key = "{}-{}.tns".format(dims, sparsity)
+    else:
+        key = "{}-{}-{}.tns".format(dims, sparsity, variant)
     return os.path.join(path, "random", key)

 # RandomPydataSparseTensorLoader should be used to generate
@@ -106,8 +109,8 @@ class RandomPydataSparseTensorLoader:
     def __init__(self):
         self.loader = PydataSparseTensorLoader()

-    def random(self, shape, sparsity):
-        key = construct_random_tensor_key(shape, sparsity)
+    def random(self, shape, sparsity, variant=None):
+        key = construct_random_tensor_key(shape, sparsity, variant)
         # If a tensor with these properties exists already, then load it.
         if os.path.exists(key):
             return self.loader.load(key)
@@ -126,9 +129,9 @@ def __init__(self, format):
         self.loader = ScipySparseTensorLoader(format)
         self.format = format

-    def random(self, shape, sparsity):
+    def random(self, shape, sparsity, variant=None):
         assert(len(shape) == 2)
-        key = construct_random_tensor_key(shape, sparsity)
+        key = construct_random_tensor_key(shape, sparsity, variant)
         # If a tensor with these properties exists already, then load it.
         if os.path.exists(key):
             return self.loader.load(key)
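
The variant parameter only changes the on-disk cache key described in the comment above: with no variant the key stays dims-sparsity.tns, and a variant n appends -n, so several distinct random tensors of the same shape and sparsity can coexist. A small illustration of the naming scheme (a standalone sketch; TENSOR_PATH is stubbed here):

import os

TENSOR_PATH = "data"  # stand-in for the benchmark's tensor directory

def construct_random_tensor_key(shape, sparsity, variant):
    dims = "x".join(str(dim) for dim in shape)
    if variant is None:
        key = "{}-{}.tns".format(dims, sparsity)
    else:
        key = "{}-{}-{}.tns".format(dims, sparsity, variant)
    return os.path.join(TENSOR_PATH, "random", key)

print(construct_random_tensor_key((250, 250), 0.01, None))  # data/random/250x250-0.01.tns
print(construct_random_tensor_key((250, 250), 0.01, 1))     # data/random/250x250-0.01-1.tns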

numpy/windowing.py

+18 −10

@@ -14,7 +14,6 @@
 sizeConfigs = ["Constant", "ConstantFraction", "AlmostWhole", "Whole", "NoWindowing"]

 def sliceTensor(tensor, dim, config):
-    return tensor
     if config == "Constant":
         return tensor[250:750, 250:750]
     elif config == "ConstantFraction":
@@ -36,9 +35,11 @@ def sliceTensor(tensor, dim, config):
 def bench_add_sparse_window(tacoBench, dim, format, config):
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = sliceTensor(matrix, dim, config)
-        res = x + x
+        x2 = sliceTensor(matrix2, dim, config)
+        res = x + x2
         # Sanity check that this has a similar runtime as taco.
         # res = matrix + matrix
     tacoBench(bench)
@@ -50,22 +51,24 @@ def bench():
 def bench_add_pydata_sparse_window(tacoBench, dim, config):
     loader = RandomPydataSparseTensorLoader()
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = sliceTensor(matrix, dim, config)
-        res = x + x
+        x2 = sliceTensor(matrix2, dim, config)
+        res = x + x2
     tacoBench(bench)

-# TODO (rohany): Parametrize the below tests by appropriate windowing config.
-
 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("format", ['csr', 'csc'])
 @pytest.mark.parametrize("strideWidth", [2, 4, 8])
 def bench_add_sparse_strided_window(tacoBench, dim, format, strideWidth):
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[0:dim:strideWidth, 0:dim:strideWidth]
-        res = x + x
+        x2 = matrix2[0:dim:strideWidth, 0:dim:strideWidth]
+        res = x + x2
     tacoBench(bench)

 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
@@ -75,19 +78,23 @@ def bench_add_sparse_index_set(tacoBench, dim, format, fraction):
     indexes = [i * fraction for i in range(0, dim//fraction)]
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[:, indexes]
-        res = x + x
+        x2 = matrix2[:, indexes]
+        res = x + x2
     tacoBench(bench)

 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("strideWidth", [2, 4, 8])
 def bench_add_pydata_sparse_strided_window(tacoBench, dim, strideWidth):
     loader = RandomPydataSparseTensorLoader()
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[0:dim:strideWidth, 0:dim:strideWidth]
-        res = x + x
+        x2 = matrix2[0:dim:strideWidth, 0:dim:strideWidth]
+        res = x + x2
     tacoBench(bench)

 # TODO (rohany): This is really slow (compared to scipy.sparse). Check with hameer
@@ -98,12 +105,13 @@ def bench_add_pydata_sparse_index_set(tacoBench, dim, fraction):
     loader = RandomPydataSparseTensorLoader()
     indexes = [i * fraction for i in range(0, dim//fraction)]
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[:, indexes]
-        res = x + x
+        x2 = matrix2[:, indexes]
+        res = x + x2
     tacoBench(bench)

-# TODO (rohany): I don't know if we care about this benchmark.
 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("format", ['csr', 'csc'])
 @pytest.mark.skip(reason="not using currently")
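
The common thread in these windowing changes is that each benchmark now adds two independently generated matrices (the second loaded with variant=1) instead of adding a matrix to itself: with x + x the operands share a single sparsity pattern, so the addition never has to merge two different patterns and understates the cost of a real sparse add. A quick way to see the structural difference (a sketch using scipy.sparse directly, outside the benchmark harness):

import scipy.sparse as sp

dim = 1000
x = sp.random(dim, dim, density=0.01, format='csr')
x2 = sp.random(dim, dim, density=0.01, format='csr')

same = x + x    # nnz is unchanged: both operands share one pattern
mixed = x + x2  # nnz roughly doubles: two patterns must be merged
print(same.nnz, mixed.nnz)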

taco/bench.cpp

+8 −4

@@ -38,22 +38,26 @@ std::string cleanPath(std::string path) {
   return result;
 }

-std::string constructRandomTensorKey(std::vector<int> dims, float sparsity) {
+std::string constructRandomTensorKey(std::vector<int> dims, float sparsity, int variant) {
   auto path = getTacoTensorPath();
   std::stringstream result;
   result << path;
   if (path[path.size() - 1] != '/') {
     result << "/";
   }
   result << "random/";
-  result << taco::util::join(dims, "x") << "-" << sparsity << ".tns";
+  if (variant == 0) {
+    result << taco::util::join(dims, "x") << "-" << sparsity << ".tns";
+  } else {
+    result << taco::util::join(dims, "x") << "-" << sparsity << "-" << variant << ".tns";
+  }
   return result.str();
 }

-taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format) {
+taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format, int variant) {
   // For now, just say that the python code must generate the random
   // tensor before use.
-  auto tensor = taco::read(constructRandomTensorKey(dims, sparsity), format, true);
+  auto tensor = taco::read(constructRandomTensorKey(dims, sparsity, variant), format, true);
   tensor.setName(name);
   return tensor;
 }

taco/bench.h

+1 −1

@@ -49,7 +49,7 @@ std::string getTacoTensorPath();
 std::string getValidationOutputPath();
 // cleanPath ensures that the input path ends with "/".
 std::string cleanPath(std::string path);
-taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format);
+taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format, int variant=0);

 template<typename T>
 taco::Tensor<T> castToType(std::string name, taco::Tensor<double> tensor) {

taco/ufuncs.cpp

+33 −4

@@ -33,6 +33,12 @@ struct xorAlgebra {
   }
 };

+struct andAlgebra {
+  IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
+    return Intersect(regions[0], regions[1]);
+  }
+};
+
 struct RightShift{
   ir::Expr operator()(const std::vector<ir::Expr> &v) {
     if (v.size() == 1)
@@ -155,6 +161,33 @@ static void applyBenchSizes(benchmark::internal::Benchmark* b) {
 TACO_BENCH_ARGS(bench_ufunc_sparse, xor_0.01, 0.01, "xor")->Apply(applyBenchSizes);
 TACO_BENCH_ARGS(bench_ufunc_sparse, rightShift_0.01, 0.01, ">>")->Apply(applyBenchSizes);

+Func ldExp("ldexp", Ldexp(), leftIncAlgebra());
+Func rightShift("right_shift", RightShift(), leftIncAlgebra());
+Func xorOp("logical_xor", GeneralAdd(), xorAlgebra());
+Func andOp("logical_and", GeneralAdd(), andAlgebra());
+
+static void bench_ufunc_fused(benchmark::State& state, const Format& f) {
+  int dim = state.range(0);
+  auto sparsity = 0.01;
+  Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix1 = loadRandomTensor("B", {dim, dim}, sparsity, f, 1 /* variant */);
+  Tensor<double> matrix2 = loadRandomTensor("C", {dim, dim}, sparsity, f, 2 /* variant */);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    Tensor<double> result("result", {dim, dim}, f);
+    IndexVar i("i"), j("j");
+    result(i, j) = andOp(xorOp(matrix(i, j), matrix1(i, j)), matrix2(i, j));
+    result.setAssembleWhileCompute(true);
+    result.compile();
+    state.ResumeTiming();
+
+    result.compute();
+  }
+}
+TACO_BENCH_ARGS(bench_ufunc_fused, csr, CSR)
+  ->ArgsProduct({{5000, 10000, 20000}});
+
 // UfuncInputCache is a cache for the input to ufunc benchmarks. These benchmarks
 // operate on a tensor loaded from disk and the same tensor shifted slightly. Since
 // these operations are run multiple times, we can save a lot in benchmark startup
@@ -242,10 +275,6 @@ static void bench_frostt_ufunc(benchmark::State& state, std::string tnsPath, Fun
   }
 }

-Func ldExp("ldexp", Ldexp(), leftIncAlgebra());
-Func rightShift("right_shift", RightShift(), leftIncAlgebra());
-Func xorOp("logical_xor", GeneralAdd(), xorAlgebra());
-
 #define FOREACH_FROSTT_TENSOR(__func__) \
   __func__(nips, "nips.tns") \
   __func__(uber-pickups, "uber-pickups.tns") \
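
On the taco side, the fused expression is built from Func objects whose iteration algebras describe where each operator can produce nonzeros: the new andAlgebra iterates only the Intersect of its two operands' regions (logical_and is zero wherever either operand is zero), while xorOp reuses the xorAlgebra already defined earlier in this file. A dense numpy check of the semantics the fused kernel should match (an illustrative sketch, not part of the benchmark):

import numpy as np

rng = np.random.default_rng(0)
A, B, C = (rng.integers(0, 2, size=(4, 4)) for _ in range(3))

fused = np.logical_and(np.logical_xor(A, B), C)
# Nonzero exactly where A and B disagree AND C is nonzero.
reference = ((A != 0) ^ (B != 0)) & (C != 0)
assert (fused == reference).all()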

taco/windowing.cpp

+18 −15

@@ -28,34 +28,34 @@ enum WindowConfig {
   __func__(Whole, Whole) \
   __func__(NoWindowing, NoWindowing)

-Tensor<double> windowedTensorOp(Tensor<double> input, int dim, WindowConfig config) {
+Tensor<double> windowedTensorOp(Tensor<double> input1, Tensor<double> input2, int dim, WindowConfig config) {
   IndexVar i, j;
   switch (config) {
     case Constant: {
-      Tensor<double> result("B", {500, 500}, input.getFormat());
-      result(i, j) = input(i(250, 750), j(250, 750)) + input(i(250, 750), j(250, 750));
+      Tensor<double> result("B", {500, 500}, input1.getFormat());
+      result(i, j) = input1(i(250, 750), j(250, 750)) + input2(i(250, 750), j(250, 750));
       return result;
     }
     case ConstantFraction: {
       int size = dim / 4;
       int start = dim / 4;
-      Tensor<double> result("B", {size, size}, input.getFormat());
-      result(i, j) = input(i(start, start + size), j(start, start + size)) + input(i(start, start + size), j(start, start + size));
+      Tensor<double> result("B", {size, size}, input1.getFormat());
+      result(i, j) = input1(i(start, start + size), j(start, start + size)) + input2(i(start, start + size), j(start, start + size));
       return result;
     }
     case AlmostWhole: {
-      Tensor<double> result("B", {dim-2, dim-2}, input.getFormat());
-      result(i, j) = input(i(1, dim-1), j(1, dim-1)) + input(i(1, dim-1), j(1, dim-1));
+      Tensor<double> result("B", {dim-2, dim-2}, input1.getFormat());
+      result(i, j) = input1(i(1, dim-1), j(1, dim-1)) + input2(i(1, dim-1), j(1, dim-1));
       return result;
     }
     case Whole: {
-      Tensor<double> result("B", {dim, dim}, input.getFormat());
-      result(i, j) = input(i(0, dim), j(0, dim)) + input(i(0, dim), j(0, dim));
+      Tensor<double> result("B", {dim, dim}, input1.getFormat());
+      result(i, j) = input1(i(0, dim), j(0, dim)) + input2(i(0, dim), j(0, dim));
       return result;
     }
     case NoWindowing: {
-      Tensor<double> result("B", {dim, dim}, input.getFormat());
-      result(i, j) = input(i, j) + input(i, j);
+      Tensor<double> result("B", {dim, dim}, input1.getFormat());
+      result(i, j) = input1(i, j) + input2(i, j);
       return result;
     }
     default:
@@ -67,12 +67,13 @@ static void bench_add_sparse_window(benchmark::State& state, const Format& f, Wi
   int dim = state.range(0);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   matrix.pack();

   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
-    auto result = windowedTensorOp(matrix, dim, config);
+    auto result = windowedTensorOp(matrix, matrix2, dim, config);
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
@@ -92,14 +93,15 @@ static void bench_add_sparse_strided_window(benchmark::State& state, const Forma
   int strideWidth = state.range(1);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   matrix.pack();

   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
     Tensor<double> result("B", {dim/strideWidth, dim/strideWidth}, f);
     IndexVar i, j;
-    result(i, j) = matrix(i(0, dim, strideWidth), j(0, dim, strideWidth)) + matrix(i(0, dim, strideWidth), j(0, dim, strideWidth));
+    result(i, j) = matrix(i(0, dim, strideWidth), j(0, dim, strideWidth)) + matrix2(i(0, dim, strideWidth), j(0, dim, strideWidth));
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
@@ -118,6 +120,7 @@ static void bench_add_sparse_index_set(benchmark::State& state, const Format& f)
   int fraction = state.range(1);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   std::vector<int> indexSet;
   for (int i = 0; i < dim / fraction; i++) {
     indexSet.push_back(i * fraction);
@@ -126,9 +129,9 @@ static void bench_add_sparse_index_set(benchmark::State& state, const Format& f)
   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
-     Tensor<double> result("B", {dim, dim / fraction}, f);
+    Tensor<double> result("B", {dim, dim / fraction}, f);
     IndexVar i("i"), j("j");
-    result(i, j) = matrix(i, j(indexSet)) + matrix(i, j(indexSet));
+    result(i, j) = matrix(i, j(indexSet)) + matrix2(i, j(indexSet));
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
