scripts: add script that aggregates separate CSV's into single CSV

Rohan Yadav · Rohan Yadav · commit ba064c588361 · 2021-03-18T01:28:38.000-04:00
diff --git a/scripts/taco_bench_aggregator.py b/scripts/taco_bench_aggregator.py
@@ -0,0 +1,56 @@
+import csv
+import os
+import glob
+import sys
+import tqdm
+import argparse
+
+def aggregateTacoBenches(folder, outfile, labelSet=None):
+    first = True
+    outputFile = open(outfile, 'w+')
+    writer = csv.writer(outputFile, delimiter=',')
+    # If labelSet is not None, we're going to collect all of the
+    # valid labels we've seen, and output them to a file later.
+    validLabels = set()
+    # Loop through all files with a csv extension.
+    with open(outfile, 'w+') as outputFile:
+        writer = csv.writer(outputFile, delimiter=',')
+        for fname in tqdm.tqdm(glob.glob(os.path.join(folder, "*.csv"))):
+            # Open up the file.
+            with open(fname, 'r') as f:
+                # Discard the first 10 lines. This corresponds to the
+                # google-benchmark generated header.
+                for i in range(0, 10):
+                    f.readline()
+                # Open the rest of the file as a CSV.
+                reader = csv.reader(f)
+                # Attempt to read the header from CSV. If this fails,
+                # the benchmark might have failed in the middle. So,
+                # just continue on to the next file.
+                try:
+                    header = next(reader)
+                except Exception as e:
+                    continue
+                # Find the column that contains label. We're going to skip
+                # entries that have a skip marker in the label.
+                labelIdx = header.index("label")
+                if first:
+                    writer.writerow(header)
+                    first = False
+                for row in reader:
+                    if "SKIPPED" not in row[labelIdx]:
+                        validLabels.add(row[labelIdx])
+                        writer.writerow(row)
+    # Write out the set of valid labels.
+    if labelSet is not None:
+        with open(labelSet, 'w+') as validSet:
+            for l in validLabels:
+                validSet.write(l)
+                validSet.write("\n")
+
+parser = argparse.ArgumentParser()
+parser.add_argument('target_directory', type=str, help="Directory containing CSV's to aggregate")
+parser.add_argument('output_csv_name', type=str, help="Name of the CSV to generate")
+parser.add_argument('--label_set_file', type=str, default=None, help='Set to output all valid labels seen to a file')
+args = parser.parse_args()
+aggregateTacoBenches(args.target_directory, args.output_csv_name, labelSet=args.label_set_file)