
Commit 7b29fe9

Merge pull request #24 from tensor-compiler/variants-and-fusing
numpy,taco: updates to windowing benchmarks, add a fusing benchmark
2 parents 3317204 + e7fb9c5 commit 7b29fe9

7 files changed: +98 −41 lines changed

numpy/ufuncs.py

+11 −1

@@ -3,7 +3,7 @@
 import sparse
 import pytest
 import os
-from util import TensorCollectionFROSTT, PydataTensorShifter, TensorCollectionSuiteSparse, ScipyTensorShifter, PydataMatrixMarketTensorLoader, ScipyMatrixMarketTensorLoader, VALIDATION_OUTPUT_PATH, PydataSparseTensorDumper, SuiteSparseTensor, safeCastPydataTensorToInts
+from util import TensorCollectionFROSTT, PydataTensorShifter, TensorCollectionSuiteSparse, ScipyTensorShifter, PydataMatrixMarketTensorLoader, ScipyMatrixMarketTensorLoader, VALIDATION_OUTPUT_PATH, PydataSparseTensorDumper, SuiteSparseTensor, safeCastPydataTensorToInts, RandomPydataSparseTensorLoader

 # TODO (rohany): Ask hameer about this. pydata/sparse isn't happy when
 # given this ufunc to evaluate.
@@ -34,6 +34,16 @@ def bench():
         C = ufunc(A, B)
     tacoBench(bench)

+@pytest.mark.parametrize("dim", [5000, 10000, 20000])
+def bench_pydata_ufunc_fused(tacoBench, dim):
+    loader = RandomPydataSparseTensorLoader()
+    matrix = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01))
+    matrix1 = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01, variant=1))
+    matrix2 = safeCastPydataTensorToInts(loader.random((dim, dim), 0.01, variant=2))
+    def bench():
+        result = numpy.logical_and(numpy.logical_xor(matrix, matrix1), matrix2)
+        return result
+    tacoBench(bench)

 def import_tensor(filename, dim):
     print(filename)
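
A note on what the new fused benchmark measures: it evaluates the element-wise expression (A xor B) and C over three independently generated 1%-dense operands. taco can compile this into a single fused sparse kernel, while the numpy/pydata baseline evaluates one ufunc at a time and materializes the logical_xor intermediate. A minimal standalone sketch of the same computation (assumes only numpy and sparse are installed; the repo's loader and tacoBench fixture are not used here):

import numpy
import sparse

dim = 1000
# Three independent operands at the benchmark's 1% density.
A = sparse.random((dim, dim), density=0.01).astype(numpy.int64)
B = sparse.random((dim, dim), density=0.01).astype(numpy.int64)
C = sparse.random((dim, dim), density=0.01).astype(numpy.int64)

# numpy dispatches these ufuncs to pydata/sparse; the logical_xor
# result is materialized before logical_and consumes it.
result = numpy.logical_and(numpy.logical_xor(A, B), C)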

numpy/util.py

+9 −6

@@ -92,10 +92,13 @@ def dump(self, tensor, path):
 # The key itself is formatted by the dimensions, followed by the
 # sparsity. For example, a 250 by 250 tensor with sparsity 0.01
 # would have a key of 250x250-0.01.tns.
-def construct_random_tensor_key(shape, sparsity):
+def construct_random_tensor_key(shape, sparsity, variant):
     path = TENSOR_PATH
     dims = "x".join([str(dim) for dim in shape])
-    key = "{}-{}.tns".format(dims, sparsity)
+    if variant is None:
+        key = "{}-{}.tns".format(dims, sparsity)
+    else:
+        key = "{}-{}-{}.tns".format(dims, sparsity, variant)
     return os.path.join(path, "random", key)

 # RandomPydataSparseTensorLoader should be used to generate
@@ -106,8 +109,8 @@ class RandomPydataSparseTensorLoader:
     def __init__(self):
         self.loader = PydataSparseTensorLoader()

-    def random(self, shape, sparsity):
-        key = construct_random_tensor_key(shape, sparsity)
+    def random(self, shape, sparsity, variant=None):
+        key = construct_random_tensor_key(shape, sparsity, variant)
         # If a tensor with these properties exists already, then load it.
         if os.path.exists(key):
             return self.loader.load(key)
@@ -126,9 +129,9 @@ def __init__(self, format):
         self.loader = ScipySparseTensorLoader(format)
         self.format = format

-    def random(self, shape, sparsity):
+    def random(self, shape, sparsity, variant=None):
         assert(len(shape) == 2)
-        key = construct_random_tensor_key(shape, sparsity)
+        key = construct_random_tensor_key(shape, sparsity, variant)
         # If a tensor with these properties exists already, then load it.
         if os.path.exists(key):
             return self.loader.load(key)
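
The variant parameter only changes the on-disk cache key described in the comment above: with no variant the key stays dims-sparsity.tns, and a variant n appends -n, so several distinct random tensors of the same shape and sparsity can coexist. A small illustration of the naming scheme (a standalone sketch; TENSOR_PATH is stubbed here):

import os

TENSOR_PATH = "data"  # stand-in for the benchmark's tensor directory

def construct_random_tensor_key(shape, sparsity, variant):
    dims = "x".join(str(dim) for dim in shape)
    if variant is None:
        key = "{}-{}.tns".format(dims, sparsity)
    else:
        key = "{}-{}-{}.tns".format(dims, sparsity, variant)
    return os.path.join(TENSOR_PATH, "random", key)

print(construct_random_tensor_key((250, 250), 0.01, None))  # data/random/250x250-0.01.tns
print(construct_random_tensor_key((250, 250), 0.01, 1))     # data/random/250x250-0.01-1.tns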

numpy/windowing.py

+18 −10

@@ -14,7 +14,6 @@
 sizeConfigs = ["Constant", "ConstantFraction", "AlmostWhole", "Whole", "NoWindowing"]

 def sliceTensor(tensor, dim, config):
-    return tensor
     if config == "Constant":
         return tensor[250:750, 250:750]
     elif config == "ConstantFraction":
@@ -36,9 +35,11 @@ def sliceTensor(tensor, dim, config):
 def bench_add_sparse_window(tacoBench, dim, format, config):
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = sliceTensor(matrix, dim, config)
-        res = x + x
+        x2 = sliceTensor(matrix2, dim, config)
+        res = x + x2
         # Sanity check that this has a similar runtime as taco.
         # res = matrix + matrix
     tacoBench(bench)
@@ -50,22 +51,24 @@ def bench():
 def bench_add_pydata_sparse_window(tacoBench, dim, config):
     loader = RandomPydataSparseTensorLoader()
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = sliceTensor(matrix, dim, config)
-        res = x + x
+        x2 = sliceTensor(matrix2, dim, config)
+        res = x + x2
     tacoBench(bench)

-# TODO (rohany): Parametrize the below tests by appropriate windowing config.
-
 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("format", ['csr', 'csc'])
 @pytest.mark.parametrize("strideWidth", [2, 4, 8])
 def bench_add_sparse_strided_window(tacoBench, dim, format, strideWidth):
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[0:dim:strideWidth, 0:dim:strideWidth]
-        res = x + x
+        x2 = matrix2[0:dim:strideWidth, 0:dim:strideWidth]
+        res = x + x2
     tacoBench(bench)

 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
@@ -75,19 +78,23 @@ def bench_add_sparse_index_set(tacoBench, dim, format, fraction):
     indexes = [i * fraction for i in range(0, dim//fraction)]
     loader = RandomScipySparseTensorLoader(format)
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[:, indexes]
-        res = x + x
+        x2 = matrix2[:, indexes]
+        res = x + x2
     tacoBench(bench)

 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("strideWidth", [2, 4, 8])
 def bench_add_pydata_sparse_strided_window(tacoBench, dim, strideWidth):
     loader = RandomPydataSparseTensorLoader()
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[0:dim:strideWidth, 0:dim:strideWidth]
-        res = x + x
+        x2 = matrix2[0:dim:strideWidth, 0:dim:strideWidth]
+        res = x + x2
     tacoBench(bench)

 # TODO (rohany): This is really slow (compared to scipy.sparse). Check with hameer
@@ -98,12 +105,13 @@ def bench_add_pydata_sparse_index_set(tacoBench, dim, fraction):
     loader = RandomPydataSparseTensorLoader()
     indexes = [i * fraction for i in range(0, dim//fraction)]
     matrix = loader.random((dim, dim), 0.01)
+    matrix2 = loader.random((dim, dim), 0.01, variant=1)
     def bench():
         x = matrix[:, indexes]
-        res = x + x
+        x2 = matrix2[:, indexes]
+        res = x + x2
     tacoBench(bench)

-# TODO (rohany): I don't know if we care about this benchmark.
 @pytest.mark.parametrize("dim", [5000, 10000, 20000])
 @pytest.mark.parametrize("format", ['csr', 'csc'])
 @pytest.mark.skip(reason="not using currently")
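
The common thread in these windowing changes is that each benchmark now adds two independently generated matrices (the second loaded with variant=1) instead of adding a matrix to itself: with x + x the operands share a single sparsity pattern, so the addition never has to merge two different patterns and understates the cost of a real sparse add. A quick way to see the structural difference (a sketch using scipy.sparse directly, outside the benchmark harness):

import scipy.sparse as sp

dim = 1000
x = sp.random(dim, dim, density=0.01, format='csr')
x2 = sp.random(dim, dim, density=0.01, format='csr')

same = x + x    # nnz is unchanged: both operands share one pattern
mixed = x + x2  # nnz roughly doubles: two patterns must be merged
print(same.nnz, mixed.nnz)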

taco/bench.cpp

+8 −4

@@ -38,22 +38,26 @@ std::string cleanPath(std::string path) {
   return result;
 }

-std::string constructRandomTensorKey(std::vector<int> dims, float sparsity) {
+std::string constructRandomTensorKey(std::vector<int> dims, float sparsity, int variant) {
   auto path = getTacoTensorPath();
   std::stringstream result;
   result << path;
   if (path[path.size() - 1] != '/') {
     result << "/";
   }
   result << "random/";
-  result << taco::util::join(dims, "x") << "-" << sparsity << ".tns";
+  if (variant == 0) {
+    result << taco::util::join(dims, "x") << "-" << sparsity << ".tns";
+  } else {
+    result << taco::util::join(dims, "x") << "-" << sparsity << "-" << variant << ".tns";
+  }
   return result.str();
 }

-taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format) {
+taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format, int variant) {
   // For now, just say that the python code must generate the random
   // tensor before use.
-  auto tensor = taco::read(constructRandomTensorKey(dims, sparsity), format, true);
+  auto tensor = taco::read(constructRandomTensorKey(dims, sparsity, variant), format, true);
   tensor.setName(name);
   return tensor;
 }

taco/bench.h

+1 −1

@@ -49,7 +49,7 @@ std::string getTacoTensorPath();
 std::string getValidationOutputPath();
 // cleanPath ensures that the input path ends with "/".
 std::string cleanPath(std::string path);
-taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format);
+taco::TensorBase loadRandomTensor(std::string name, std::vector<int> dims, float sparsity, taco::Format format, int variant=0);

 template<typename T>
 taco::Tensor<T> castToType(std::string name, taco::Tensor<double> tensor) {

taco/ufuncs.cpp

+33 −4

@@ -33,6 +33,12 @@ struct xorAlgebra {
   }
 };

+struct andAlgebra {
+  IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
+    return Intersect(regions[0], regions[1]);
+  }
+};
+
 struct RightShift{
   ir::Expr operator()(const std::vector<ir::Expr> &v) {
     if (v.size() == 1)
@@ -155,6 +161,33 @@ static void applyBenchSizes(benchmark::internal::Benchmark* b) {
 TACO_BENCH_ARGS(bench_ufunc_sparse, xor_0.01, 0.01, "xor")->Apply(applyBenchSizes);
 TACO_BENCH_ARGS(bench_ufunc_sparse, rightShift_0.01, 0.01, ">>")->Apply(applyBenchSizes);

+Func ldExp("ldexp", Ldexp(), leftIncAlgebra());
+Func rightShift("right_shift", RightShift(), leftIncAlgebra());
+Func xorOp("logical_xor", GeneralAdd(), xorAlgebra());
+Func andOp("logical_and", GeneralAdd(), andAlgebra());
+
+static void bench_ufunc_fused(benchmark::State& state, const Format& f) {
+  int dim = state.range(0);
+  auto sparsity = 0.01;
+  Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix1 = loadRandomTensor("B", {dim, dim}, sparsity, f, 1 /* variant */);
+  Tensor<double> matrix2 = loadRandomTensor("C", {dim, dim}, sparsity, f, 2 /* variant */);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    Tensor<double> result("result", {dim, dim}, f);
+    IndexVar i("i"), j("j");
+    result(i, j) = andOp(xorOp(matrix(i, j), matrix1(i, j)), matrix2(i, j));
+    result.setAssembleWhileCompute(true);
+    result.compile();
+    state.ResumeTiming();
+
+    result.compute();
+  }
+}
+TACO_BENCH_ARGS(bench_ufunc_fused, csr, CSR)
+  ->ArgsProduct({{5000, 10000, 20000}});
+
 // UfuncInputCache is a cache for the input to ufunc benchmarks. These benchmarks
 // operate on a tensor loaded from disk and the same tensor shifted slightly. Since
 // these operations are run multiple times, we can save a lot in benchmark startup
@@ -242,10 +275,6 @@ static void bench_frostt_ufunc(benchmark::State& state, std::string tnsPath, Fun
   }
 }

-Func ldExp("ldexp", Ldexp(), leftIncAlgebra());
-Func rightShift("right_shift", RightShift(), leftIncAlgebra());
-Func xorOp("logical_xor", GeneralAdd(), xorAlgebra());
-
 #define FOREACH_FROSTT_TENSOR(__func__) \
   __func__(nips, "nips.tns") \
   __func__(uber-pickups, "uber-pickups.tns") \
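
On the taco side, the fused expression is built from Func objects whose iteration algebras describe where each operator can produce nonzeros: the new andAlgebra iterates only the Intersect of its two operands' regions (logical_and is zero wherever either operand is zero), while xorOp reuses the xorAlgebra already defined earlier in this file. A dense numpy check of the semantics the fused kernel should match (an illustrative sketch, not part of the benchmark):

import numpy as np

rng = np.random.default_rng(0)
A, B, C = (rng.integers(0, 2, size=(4, 4)) for _ in range(3))

fused = np.logical_and(np.logical_xor(A, B), C)
# Nonzero exactly where A and B disagree AND C is nonzero.
reference = ((A != 0) ^ (B != 0)) & (C != 0)
assert (fused == reference).all()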

taco/windowing.cpp

+18 −15

@@ -28,34 +28,34 @@ enum WindowConfig {
   __func__(Whole, Whole) \
   __func__(NoWindowing, NoWindowing)

-Tensor<double> windowedTensorOp(Tensor<double> input, int dim, WindowConfig config) {
+Tensor<double> windowedTensorOp(Tensor<double> input1, Tensor<double> input2, int dim, WindowConfig config) {
   IndexVar i, j;
   switch (config) {
     case Constant: {
-      Tensor<double> result("B", {500, 500}, input.getFormat());
-      result(i, j) = input(i(250, 750), j(250, 750)) + input(i(250, 750), j(250, 750));
+      Tensor<double> result("B", {500, 500}, input1.getFormat());
+      result(i, j) = input1(i(250, 750), j(250, 750)) + input2(i(250, 750), j(250, 750));
       return result;
     }
     case ConstantFraction: {
       int size = dim / 4;
       int start = dim / 4;
-      Tensor<double> result("B", {size, size}, input.getFormat());
-      result(i, j) = input(i(start, start + size), j(start, start + size)) + input(i(start, start + size), j(start, start + size));
+      Tensor<double> result("B", {size, size}, input1.getFormat());
+      result(i, j) = input1(i(start, start + size), j(start, start + size)) + input2(i(start, start + size), j(start, start + size));
       return result;
     }
     case AlmostWhole: {
-      Tensor<double> result("B", {dim-2, dim-2}, input.getFormat());
-      result(i, j) = input(i(1, dim-1), j(1, dim-1)) + input(i(1, dim-1), j(1, dim-1));
+      Tensor<double> result("B", {dim-2, dim-2}, input1.getFormat());
+      result(i, j) = input1(i(1, dim-1), j(1, dim-1)) + input2(i(1, dim-1), j(1, dim-1));
       return result;
     }
     case Whole: {
-      Tensor<double> result("B", {dim, dim}, input.getFormat());
-      result(i, j) = input(i(0, dim), j(0, dim)) + input(i(0, dim), j(0, dim));
+      Tensor<double> result("B", {dim, dim}, input1.getFormat());
+      result(i, j) = input1(i(0, dim), j(0, dim)) + input2(i(0, dim), j(0, dim));
       return result;
     }
     case NoWindowing: {
-      Tensor<double> result("B", {dim, dim}, input.getFormat());
-      result(i, j) = input(i, j) + input(i, j);
+      Tensor<double> result("B", {dim, dim}, input1.getFormat());
+      result(i, j) = input1(i, j) + input2(i, j);
       return result;
     }
     default:
@@ -67,12 +67,13 @@ static void bench_add_sparse_window(benchmark::State& state, const Format& f, Wi
   int dim = state.range(0);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   matrix.pack();

   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
-    auto result = windowedTensorOp(matrix, dim, config);
+    auto result = windowedTensorOp(matrix, matrix2, dim, config);
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
@@ -92,14 +93,15 @@ static void bench_add_sparse_strided_window(benchmark::State& state, const Forma
   int strideWidth = state.range(1);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   matrix.pack();

   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
     Tensor<double> result("B", {dim/strideWidth, dim/strideWidth}, f);
     IndexVar i, j;
-    result(i, j) = matrix(i(0, dim, strideWidth), j(0, dim, strideWidth)) + matrix(i(0, dim, strideWidth), j(0, dim, strideWidth));
+    result(i, j) = matrix(i(0, dim, strideWidth), j(0, dim, strideWidth)) + matrix2(i(0, dim, strideWidth), j(0, dim, strideWidth));
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
@@ -118,6 +120,7 @@ static void bench_add_sparse_index_set(benchmark::State& state, const Format& f)
   int fraction = state.range(1);
   auto sparsity = 0.01;
   Tensor<double> matrix = loadRandomTensor("A", {dim, dim}, sparsity, f);
+  Tensor<double> matrix2 = loadRandomTensor("A2", {dim, dim}, sparsity, f, 1 /* variant */);
   std::vector<int> indexSet;
   for (int i = 0; i < dim / fraction; i++) {
     indexSet.push_back(i * fraction);
@@ -126,9 +129,9 @@ static void bench_add_sparse_index_set(benchmark::State& state, const Format& f)
   for (auto _ : state) {
     // Setup.
     state.PauseTiming();
-     Tensor<double> result("B", {dim, dim / fraction}, f);
+    Tensor<double> result("B", {dim, dim / fraction}, f);
     IndexVar i("i"), j("j");
-    result(i, j) = matrix(i, j(indexSet)) + matrix(i, j(indexSet));
+    result(i, j) = matrix(i, j(indexSet)) + matrix2(i, j(indexSet));
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
