engl-2311-blog/assets/benchmarking-dwarfs/process-data.py

232 lines
8 KiB
Python

#!/usr/bin/env python3
import csv
import re
# a bunch of now-less-horrible code to make the chart.js code
class HelperFunctions:
    """Stateless helpers that turn raw benchmark CSV fields into chart values.

    Every helper is a static method, so it works both when called on the
    class (``HelperFunctions.get_fs(...)``) and when called on an instance.
    """

    # Unit suffixes ordered from smallest to largest; each step is a factor
    # of 1000.
    _UNITS = ('ns', 'µs', 'ms', 's')

    # Chart labels for file names that must match exactly.
    _EXACT_LABELS = {
        '25G-null.bin': 'Null 25 GiB file',
        '25G-random.bin': 'Random 25 GiB file',
        '100M-polygon.txt': '100 million-sided polygon data',
        'small-files/random': '1024 random files (avg)',
        'small-files/null': '1024 null files (avg)',
    }

    @staticmethod
    def get_fs(dir):
        """Return the filesystem display name for the directory path *dir*.

        Paths ending in ``dwarfs`` map to ``'DwarFS'``, paths ending in
        ``fuse-archive-tar`` map to ``'fuse-archive (tar)'``, and every other
        path is reported as ``'Btrfs'``.
        """
        if dir.endswith('dwarfs'):
            return 'DwarFS'
        if dir.endswith('fuse-archive-tar'):
            return 'fuse-archive (tar)'
        return 'Btrfs'

    @staticmethod
    def get_label(filename):
        """Return the chart label for *filename*, or ``None`` if it is unknown.

        Any name starting with ``kernel`` is labelled ``'Linux LTS kernel'``;
        all other names must match one of the known file names exactly.
        """
        label = HelperFunctions._EXACT_LABELS.get(filename)
        if label is not None:
            return label
        if filename.startswith('kernel'):
            return 'Linux LTS kernel'
        return None

    @staticmethod
    def convert_time(time: str, unit: str) -> float:
        """Convert a timing string such as ``'12.5ms'`` into *unit*.

        Args:
            time: A number followed by a unit suffix. Anything that does not
                end in ``ms``, ``µs`` or ``ns`` is treated as seconds.
            unit: Target unit; one of ``'ns'``, ``'µs'``, ``'ms'``, ``'s'``.

        Returns:
            The numeric value of *time* expressed in *unit*, as a float.

        Raises:
            ValueError: If *unit* is not a known unit or *time* contains no
                parsable number.
        """
        units = HelperFunctions._UNITS
        for suffix in ('ms', 'µs', 'ns'):
            if time.endswith(suffix):
                current_unit = suffix
                break
        else:
            current_unit = 's'
        # Positive when converting to a smaller unit (the number grows),
        # negative when converting to a larger one (the number shrinks).
        unit_multiplier = units.index(current_unit) - units.index(unit)
        return HelperFunctions.time_num(time) * (1000**unit_multiplier)

    @staticmethod
    def time_num(time: str) -> float:
        """Return the numeric part of *time* with every non-digit/dot removed."""
        return float(re.sub(r'[^0-9\.]', '', time))
def _time_unit(raw: str):
    """Return the unit suffix of the timing string *raw*, or None if it has none."""
    for suffix in ('ms', 'µs', 'ns'):
        if raw.endswith(suffix):
            return suffix
    # Plain seconds: only "s" is left once digits and dots are stripped.
    if re.sub(r'[0-9\.]', '', raw) == 's':
        return 's'
    return None


def _largest_time_unit(raw_times):
    """Return the largest unit used by any string in *raw_times* ('ns' if none)."""
    units = ('ns', 'µs', 'ms', 's')
    rank = 0
    for raw in raw_times:
        unit = _time_unit(raw)
        if unit is not None:
            rank = max(rank, units.index(unit))
    return units[rank]


def get_data(single_files_index: int, bulk_test_name: str):
    """Collect chart labels and per-filesystem timings for one benchmark.

    Reads ``benchmark-data.csv`` (one timing per row, taken from column
    *single_files_index*) and ``bulk.csv`` (rows whose third column equals
    *bulk_test_name*; every column from the fourth onwards is a timing, and
    the row contributes their mean). In both files the first column is passed
    to ``HelperFunctions.get_fs`` and the second to
    ``HelperFunctions.get_label``.

    All timings are converted to one common unit: the largest unit found in
    either file. The unit is chosen before anything is converted, so a bulk
    timing in a larger unit than the single-file timings no longer leaves the
    two halves of the chart in different units.

    Args:
        single_files_index: Column of ``benchmark-data.csv`` holding the
            timing to chart.
        bulk_test_name: Test name selecting the rows of ``bulk.csv``.

    Returns:
        A tuple ``(data, largest_time_unit)``. ``data`` maps ``'labels'`` to
        the list of chart labels and each filesystem name to its list of
        timings (floats in ``largest_time_unit``).
    """
    single_csv = 'assets/benchmarking-dwarfs/data/benchmark-data.csv'
    bulk_csv = 'assets/benchmarking-dwarfs/data/bulk.csv'
    fuse_archive = 'fuse-archive (tar)'
    skip_fuse_archive_tar = bulk_test_name == 'bulk_random_read_latency'

    labels = []
    single_raw = {}  # filesystem -> raw timing strings, in file order
    # The 'µs' suffix is non-ASCII, so do not depend on the locale's default
    # encoding (assumes the CSV files are UTF-8, like this script).
    with open(single_csv, 'rt', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:  # blank line in the CSV
                continue
            fs = HelperFunctions.get_fs(row[0])
            if skip_fuse_archive_tar and fs == fuse_archive:
                continue
            label = HelperFunctions.get_label(row[1])
            if label not in labels:
                labels.append(label)
            single_raw.setdefault(fs, []).append(row[single_files_index])
    # Labels gathered so far all belong to single-file tests; a filesystem
    # that only appears in the bulk data needs this many placeholder values.
    single_label_count = len(labels)

    bulk_rows = []  # (filesystem, raw timing strings) per matching row
    with open(bulk_csv, 'rt', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row or row[2] != bulk_test_name:
                continue
            fs = HelperFunctions.get_fs(row[0])
            label = HelperFunctions.get_label(row[1])
            if label not in labels:
                labels.append(label)
            bulk_rows.append((fs, row[3:]))

    # Choose the unit from every timing before converting any of them.
    all_raw = [raw for raws in single_raw.values() for raw in raws]
    for _, raws in bulk_rows:
        all_raw.extend(raws)
    largest_time_unit = _largest_time_unit(all_raw)

    data = {'labels': labels}
    for fs, raws in single_raw.items():
        data[fs] = [
            HelperFunctions.convert_time(raw, largest_time_unit) for raw in raws
        ]

    for fs, raws in bulk_rows:
        # fuse-archive fails the single-file tests for this benchmark, and on
        # a seconds scale its bulk result would show as 0 anyway, so skip it.
        if (
            skip_fuse_archive_tar
            and fs == fuse_archive
            and largest_time_unit == 's'
        ):
            continue
        times = [
            HelperFunctions.convert_time(raw, largest_time_unit) for raw in raws
        ]
        if fs not in data:
            # No single-file results for this filesystem: pad so the bulk
            # averages line up with the bulk labels.
            data[fs] = [0] * single_label_count
        data[fs].append(sum(times) / len(times))

    return (data, largest_time_unit)
def run(single_files_index: int, bulk_test_name: str, filename: str, title: str, chart_canvas_id: str):
    """Generate the Chart.js script for one benchmark chart.

    The data is collected with ``get_data`` before the output file is opened,
    so a failure while reading the CSV files does not leave a truncated
    script behind.

    Args:
        single_files_index: Forwarded to ``get_data``.
        bulk_test_name: Forwarded to ``get_data``.
        filename: Name of the script to write inside
            ``assets/benchmarking-dwarfs/js/``.
        title: Chart title; the time unit is appended to it.
        chart_canvas_id: ``id`` of the element passed to ``new Chart``.
    """
    # from https://github.com/chartjs/Chart.js/blob/master/docs/scripts/utils.js (CHART_COLORS)
    # modified so similar colors aren't adjacent
    chart_colors = [
        "'rgb(255, 99, 132)'",  # red
        "'rgb(75, 192, 192)'",  # green
        "'rgb(54, 162, 235)'",  # blue
        "'rgb(255, 159, 64)'",  # orange
        "'rgb(153, 102, 255)'",  # purple
        "'rgb(255, 205, 86)'",  # yellow
        "'rgb(201, 203, 207)'",  # grey
    ]
    labels_code = 'labels = $labels$'
    dataset_code = '''
{
label: '$label$',
data: $data$,
backgroundColor: $color$,
},
'''
    config_code = '''
config = {
type: 'bar',
data: {
datasets: data,
labels
},
options: {
plugins: {
title: {
display: true,
text: '$title$ - in $timeunit$'
},
},
responsive: true,
interaction: {
intersect: false,
},
}
};
'''
    data, largest_time_unit = get_data(single_files_index, bulk_test_name)
    labels = data.pop('labels')  # what remains is one entry per filesystem

    # The title line can contain 'µs', so write UTF-8 regardless of locale.
    with open(
        f'assets/benchmarking-dwarfs/js/{filename}', 'wt', encoding='utf-8'
    ) as f:
        f.write(labels_code.replace('$labels$', format(labels)))
        f.write('\ndata = [')
        for position, (fs, values) in enumerate(data.items()):
            # Cycle through the palette so more datasets than colors still work.
            color = chart_colors[position % len(chart_colors)]
            f.write(
                dataset_code.replace('$label$', fs)
                .replace('$data$', format(values))
                .replace('$color$', color)
            )
        f.write('\n]\n')
        f.write(
            config_code.replace('$title$', title).replace(
                '$timeunit$', largest_time_unit
            )
        )
        f.write('\nChart.defaults.borderColor = "#eee"\n')
        f.write('Chart.defaults.color = "#eee";\n')
        f.write(f'ctx = document.getElementById("{chart_canvas_id}");\n')
        f.write('new Chart(ctx, config);\n')
def declare_vars():
    """Write declare_vars.js, which declares the variables the chart scripts assign."""
    declarations = [
        'let labels;\n',
        'let config;\n',
        'let data;\n',
        'let ctx;\n',
    ]
    with open('assets/benchmarking-dwarfs/js/declare_vars.js', 'wt') as out:
        out.writelines(declarations)
if __name__ == '__main__':
    declare_vars()
    # One entry per chart, in the argument order of run():
    # (single-file CSV column, bulk test name, output script, title, canvas id)
    charts = (
        (2, 'bulk_sequential_read', 'seq_read.js', 'Sequential Read Times', 'seq_read_chart'),
        (3, 'bulk_random_read', 'rand_read.js', 'Random Read Times', 'rand_read_chart'),
        (4, 'bulk_sequential_read_latency', 'seq_latency.js', 'Sequential Read Latency', 'seq_read_latency_chart'),
        (5, 'bulk_random_read_latency', 'rand_latency.js', 'Random Read Latency', 'rand_read_latency_chart'),
    )
    for chart_args in charts:
        run(*chart_args)