Add changes to the imaging benchmark to test fused iteration lattice construction

weiya711 · weiya711 · commit 6e341fb8602b · 2021-04-07T01:18:52.000-07:00
diff --git a/numpy/image.py b/numpy/image.py
@@ -4,7 +4,7 @@
 import pytest
 import matplotlib.pyplot as plt 
 import sparse
-from util import ImagePydataSparseTensorLoader, safeCastPydataTensorToInts 
+from util import ImagePydataSparseTensorLoader, safeCastPydataTensorToInts, TnsFileDumper 
 
 
 # plot_image plots the given original, binned, xor, and sparse xor images
@@ -215,6 +215,51 @@ def dense_bench():
             return xor_img
 
         tacoBench(dense_bench)
+
+# USED ONLY FOR TESTING THE TACO ITERATION LATTICE CONSTRUCTION CODE
+def testOp(a, b, c):
+    return np.logical_and(np.logical_not(np.logical_and(a, c).astype('int')).astype('int'), np.logical_not(np.logical_and(b, c).astype('int')).astype('int')).astype('int')
+@pytest.mark.skip(reason="Used for verification only")
+@pytest.mark.parametrize("num", list(range(1, 11))) 
+@pytest.mark.parametrize("pt1", [0.5])
+def bench_test_fused_pydata(tacoBench, num, pt1):
+        loader = ImagePydataSparseTensorLoader()
+        sparse_bin_img1 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1, 1))
+        sparse_bin_img2 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1+0.05, 2))
+        sparse_bin_window = loader.sparse_window(num, 3)
+        bin_img1 = loader.dense_image(num, pt1, 1) 
+        bin_img2 = loader.dense_image(num, pt1 + 0.05, 2)
+        bin_window = loader.dense_window(num)
+
+        def sparse_bench():
+            return testOp(sparse_bin_img1, sparse_bin_img2, sparse_bin_window).astype('int')
+
+        def dense_bench():
+            return testOp(bin_img1, bin_img2, bin_window).astype('int')
+
+        ret = tacoBench(sparse_bench)
+        sparse_xor_img = sparse_bench()
+        xor_img = dense_bench()
+
+        # Write result to TNS file to see what's different
+        shape = xor_img.shape
+        result = sparse.COO.from_numpy(xor_img, fill_value=0)
+        dok = sparse.DOK(result)
+        TnsFileDumper().dump_dict_to_file(shape, dok.data, os.path.join("temp", "numpy-result-{}.tns".format(num)))
+        
+    
+        num_elements = float(np.prod(bin_img1.shape))
+        f = sparse_xor_img.fill_value
+        print("shape1", sparse_bin_img1.shape)
+        print("shape2", sparse_bin_img2.shape)
+        print("sparse img1 nnz =", sparse_bin_img1.nnz, "    ", np.sum(bin_img1 != 0))
+        print("sparse img2 nnz =", sparse_bin_img2.nnz, "    ", np.sum(bin_img2 != 0))
+        print("sparse win nnz =", sparse_bin_window.nnz, "    ", np.sum(bin_window != 0))
+        print("Total num elements", num_elements)
+        print("Fill value", f)
+        print("Sparse xor NNF = ", sparse_xor_img.nnz, "\t", "Dense xor NNF = ", np.sum(xor_img != int(f)))
+        print("Dense xor NNZ = ", np.sum(xor_img != 0))
+        assert(sparse_xor_img.nnz == np.sum(xor_img != 1))
             
 if __name__=="__main__":
     main()
diff --git a/numpy/util.py b/numpy/util.py
@@ -44,7 +44,7 @@ class TnsFileDumper:
     def __init__(self):
         pass
 
-    def dump_dict_to_file(self, shape, data, path):
+    def dump_dict_to_file(self, shape, data, path, write_shape = False):
         # Sort the data so that the output is deterministic.
         sorted_data = sorted([list(coords) + [value] for coords, value in data.items()])
         with open(path, 'w+') as f:
@@ -53,9 +53,10 @@ def dump_dict_to_file(self, shape, data, path):
                 strings = coords + [str(line[-1])]
                 f.write(" ".join(strings))
                 f.write("\n")
-            shape_strings = [str(elem) for elem in shape] + ['0']
-            f.write(" ".join(shape_strings))
-            f.write("\n")
+            if write_shape:
+                shape_strings = [str(elem) for elem in shape] + ['0']
+                f.write(" ".join(shape_strings))
+                f.write("\n")
 
 # ScipySparseTensorLoader loads a sparse tensor from a file into a
 # scipy.sparse CSR matrix.
@@ -372,7 +373,8 @@ def sparse_image(self, num, pt, variant=None, path='no'):
             bin_img = self.dense_image(num, pt, variant, path)
             result = sparse.COO.from_numpy(bin_img)
             dok = sparse.DOK(result)
-            TnsFileDumper().dump_dict_to_file(self.shape[num], dok.data, key)
+            write_shape = bin_img.flat[-1] == 0
+            TnsFileDumper().dump_dict_to_file(self.shape[num], dok.data, key, write_shape)
             return result
 
     # sparse_window and dense_window must be called after the image calls
@@ -389,7 +391,8 @@ def sparse_window(self, num, variant=3):
             result_np = self.dense_window(num)
             result = sparse.COO.from_numpy(result_np)
             dok = sparse.DOK(result)
-            TnsFileDumper().dump_dict_to_file(shape, dok.data, key)
+            write_shape = result_np.flat[-1] == 0
+            TnsFileDumper().dump_dict_to_file(shape, dok.data, key, write_shape)
             return result
 
     def dense_window(self, num):
diff --git a/taco/bench.h b/taco/bench.h
@@ -74,6 +74,24 @@ taco::Tensor<T> castToType(std::string name, taco::Tensor<double> tensor) {
   return result;
 }
 
+template<typename T>
+taco::Tensor<T> castToTypeZero(std::string name, taco::Tensor<double> tensor) {
+  taco::Tensor<T> result(name, tensor.getDimensions(), tensor.getFormat());
+  std::vector<int> coords(tensor.getOrder());
+  for (auto& value : taco::iterate<double>(tensor)) {
+    for (int i = 0; i < tensor.getOrder(); i++) {
+      coords[i] = value.first[i];
+    }
+    // Cast the value to T and insert it as-is. No substitution is made when
+    // the cast yields 0: the entry is inserted with value 0 rather than being
+    // replaced by 1 to preserve the sparsity pattern (presumably the reason
+    // for the "Zero" suffix in this function's name).
+    result.insert(coords, static_cast<T>(value.second));
+  }
+  result.pack();
+  return result;
+}
+
 template<typename T, typename T2>
 taco::Tensor<T> shiftLastMode(std::string name, taco::Tensor<T2> original) {
   taco::Tensor<T> result(name, original.getDimensions(), original.getFormat());
diff --git a/taco/image.cpp b/taco/image.cpp
@@ -55,22 +55,31 @@ struct xorAndAlgebra {
   }
 };
 
+struct testConstructionAlgebra {
+  IterationAlgebra operator()(const std::vector<IndexExpr>& regions) {
+    auto m1 = Union(Complement(regions[0]), Complement(regions[2]));
+    auto m2 = Union(Complement(regions[1]), Complement(regions[2]));
+    return Intersect(m1, m2);
+  }
+};
+
 Func xorOp1("logical_xor", Boolean(), xorAlgebra());
 Func andOp1("logical_and", Boolean(), andAlgebra());
 Func xorAndOp("fused_xor_and", Boolean(), xorAndAlgebra());
+Func testOp("test", Boolean(), testConstructionAlgebra());
 static void bench_image_xor(benchmark::State& state, const Format& f) {
   int num = state.range(0);
   auto t1 = 0.5;
   auto t2 = 0.55;
-  Tensor<int64_t> matrix1 = castToType<int64_t>("A", loadImageTensor("A", num, f, t1, 1 /* variant */));
-  Tensor<int64_t> matrix2 = castToType<int64_t>("B", loadImageTensor("B", num, f, t2, 2 /* variant */));
+  Tensor<int64_t> matrix1 = castToTypeZero<int64_t>("A", loadImageTensor("A", num, f, t1, 1 /* variant */));
+  Tensor<int64_t> matrix2 = castToTypeZero<int64_t>("B", loadImageTensor("B", num, f, t2, 2 /* variant */));
   auto dims = matrix1.getDimensions();
 
   for (auto _ : state) {
     state.PauseTiming();
     Tensor<int64_t> result("result", dims, f, 1);
     IndexVar i("i"), j("j");
-    result(i, j) = xorOp1(matrix1(i, j), matrix2(i, j));
+    result(i, j) = testOp(matrix1(i, j), matrix2(i, j));
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
@@ -86,7 +95,7 @@ static void bench_image_xor(benchmark::State& state, const Format& f) {
   }
 }
 static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 1; i <= 1; ++i)
+  for (int i = 1; i <= 11; ++i)
       b->Args({i});
 }
 TACO_BENCH_ARGS(bench_image_xor, csr, CSR)->Apply(CustomArguments);
@@ -95,11 +104,14 @@ static void bench_image_fused(benchmark::State& state, const Format& f) {
   int num = state.range(0);
   auto t1 = 0.5;
   auto t2 = 0.55;
-  Tensor<int64_t> matrix1 = castToType<int64_t>("A", loadImageTensor("A", num, f, t1, 1 /* variant */));
-  Tensor<int64_t> matrix2 = castToType<int64_t>("B", loadImageTensor("B", num, f, t2, 2 /* variant */));
-  Tensor<int64_t> matrix3 = castToType<int64_t>("C", loadImageTensor("C", num, f, 3 /* variant */));
+  Tensor<int64_t> matrix1 = castToTypeZero<int64_t>("A", loadImageTensor("A", num, f, t1, 1 /* variant */));
+  Tensor<int64_t> matrix2 = castToTypeZero<int64_t>("B", loadImageTensor("B", num, f, t2, 2 /* variant */));
+  Tensor<int64_t> matrix3 = castToTypeZero<int64_t>("C", loadImageTensor("C", num, f, 3 /* variant */));
   auto dims = matrix1.getDimensions();
 
+//  write("temp/taco-mat1-" + std::to_string(num) + ".tns", matrix1);
+//  write("temp/taco-mat2-" + std::to_string(num) + ".tns", matrix2);
+//  write("temp/taco-mat3-" + std::to_string(num) + ".tns", matrix3);
   int nnz = 0;
   for (auto& it : iterate<int64_t>(matrix1)) {
     nnz++;
@@ -115,35 +127,31 @@ static void bench_image_fused(benchmark::State& state, const Format& f) {
     nnz++;
   }
   std::cout << "Matrix3 NNZ = " << nnz << std::endl;
+
   for (auto _ : state) {
     state.PauseTiming();
     Tensor<int64_t> result("result", dims, f, 0);
-    Tensor<int64_t> temp1("t1", dims, f, 0);
-    Tensor<int64_t> temp2("t2", dims, f, 0);
+
     IndexVar i("i"), j("j");
-//    temp1(i,j) = andOp1(matrix1(i, j), matrix3(i, j));
-//    temp2(i,j) = andOp1(matrix2(i, j), matrix3(i, j));
-//    result(i, j) = xorOp1(temp1(i,j), temp2(i,j));
-//    result(i, j) = xorOp1(andOp1(matrix1(i, j), matrix3(i, j)), andOp1(matrix2(i, j), matrix3(i, j)));
-    result(i, j) = xorAndOp(matrix1(i, j), matrix2(i, j), matrix3(i, j));
+    result(i, j) = testOp(matrix1(i, j), matrix2(i, j), matrix3(i, j));
     IndexStmt stmt = result.getAssignment().concretize();
     result.setAssembleWhileCompute(true);
     result.compile();
     state.ResumeTiming();
     result.compute();
-    temp1 = temp1.removeExplicitZeros(temp1.getFormat());
-    temp2 = temp2.removeExplicitZeros(temp2.getFormat());
     result = result.removeExplicitZeros(result.getFormat());
     int nnz = 0;
     for (auto& it : iterate<int64_t>(result)) {
       nnz++;
     }
 
     std::cout << "Result NNZ = " << nnz << std::endl;
-      std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(std::cout, ir::CodeGen::ImplementationGen);
-      ir::Stmt compute = lower(stmt, "compute",  false, true);
-      codegen->compile(compute, true);
-//    std::cout << result << std::endl;
+//    write("temp/taco-result" + std::to_string(num) + ".tns", result);
+    // Used to print out generated TACO code
+//    std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(std::cout, ir::CodeGen::ImplementationGen);
+//    ir::Stmt compute = lower(stmt, "compute",  false, true);
+//    codegen->compile(compute, true);
   }
 }
+TACO_BENCH_ARGS(bench_image_fused, csr, CSR)->Apply(CustomArguments);
 TACO_BENCH_ARGS(bench_image_fused, csr, CSR)->Apply(CustomArguments);