engl-2311-blog/assets/benchmarking-dwarfs/process-data.py

#!/usr/bin/env python3
import csv
import re


# Generates the Chart.js scripts (labels, datasets and chart config) from the benchmark CSV data.


class HelperFunctions:
    """Stateless helpers for naming CSV fields and converting time strings.

    Every helper is a static method, so it can be called either on the class
    (``HelperFunctions.get_fs(...)``) or on an instance.
    """

    @staticmethod
    def get_fs(dir):
        """Return the display name of the filesystem a path belongs to.

        The name is chosen from the path's suffix; anything that does not end
        in ``dwarfs`` or ``fuse-archive-tar`` is reported as ``'Btrfs'``.
        """
        # NOTE: the parameter shadows the builtin ``dir``; the name is kept so
        # existing keyword callers keep working.
        if dir.endswith('dwarfs'):
            return 'DwarFS'
        if dir.endswith('fuse-archive-tar'):
            return 'fuse-archive (tar)'
        return 'Btrfs'

    @staticmethod
    def get_label(filename):
        """Return the chart label for a benchmark file name.

        Returns ``None`` when the file name is not one of the known ones.
        """
        exact_labels = {
            '25G-null.bin': 'Null 25 GiB file',
            '25G-random.bin': 'Random 25 GiB file',
            '100M-polygon.txt': '100 million-sided polygon data',
        }
        if filename in exact_labels:
            return exact_labels[filename]
        # Checked after the large single files and before the small-files
        # entries, matching the original precedence.
        if filename.startswith('kernel'):
            return 'Linux LTS kernel'
        if filename == 'small-files/random':
            return '1024 random files (avg)'
        if filename == 'small-files/null':
            return '1024 null files (avg)'
        return None

    @staticmethod
    def convert_time(time: str, unit: str) -> float:
        """Convert a time string such as ``'12.5ms'`` into *unit*.

        Args:
            time: A number followed by ``ns``, ``µs``, ``ms`` or ``s``; any
                other suffix is treated as seconds.
            unit: The target unit, one of ``'ns'``, ``'µs'``, ``'ms'``, ``'s'``.

        Returns:
            The numeric value expressed in *unit*, as a float.

        Raises:
            ValueError: If *unit* is not a known unit or *time* contains no
                parsable number.
        """
        unit_exponents = ['ns', 'µs', 'ms', 's']

        # 'ms', 'µs' and 'ns' all end in 's', so plain seconds is the fallback.
        current_unit = 's'
        for suffix in ('ms', 'µs', 'ns'):
            if time.endswith(suffix):
                current_unit = suffix
                break

        # Each step between neighbouring units is a factor of 1000.
        unit_multiplier = unit_exponents.index(current_unit) - unit_exponents.index(
            unit
        )
        return HelperFunctions.time_num(time) * (1000**unit_multiplier)

    @staticmethod
    def time_num(time: str) -> float:
        """Return the numeric part of a time string, dropping the unit suffix.

        Raises:
            ValueError: If nothing parsable remains once every character other
                than digits and dots has been removed.
        """
        time = re.sub(r'[^0-9.]', '', time)
        return float(time)

# Time units ordered from smallest to largest; the position doubles as a rank.
_TIME_UNITS = ('ns', 'µs', 'ms', 's')


def _detect_time_unit(value: str):
    """Return the unit suffix of *value* ('ns', 'µs', 'ms' or 's'), else None."""
    # 'ms', 'µs' and 'ns' all end in 's', so they have to be checked first.
    for suffix in ('ms', 'µs', 'ns'):
        if value.endswith(suffix):
            return suffix
    # Plain seconds: only digits and dots in front of a lone 's'.
    if re.sub(r'[0-9.]', '', value) == 's':
        return 's'
    return None


def _largest_time_unit(values, current: str = 'ns') -> str:
    """Return the largest unit found in *values*, never smaller than *current*."""
    largest = current
    for value in values:
        unit = _detect_time_unit(value)
        if unit is not None and _TIME_UNITS.index(unit) > _TIME_UNITS.index(largest):
            largest = unit
    return largest


def get_data(single_files_index: int, bulk_test_name: str):
    """Collect the chart labels and per-filesystem times for one benchmark.

    Single-file results are read from ``benchmark-data.csv`` (column
    *single_files_index* of every row). Bulk results are read from
    ``bulk.csv``: only rows whose third column equals *bulk_test_name* are
    used, and the values from the fourth column onwards are averaged.

    Every value is converted to the largest time unit that occurs in either
    data set, so all numbers in the returned structure share one unit.

    Args:
        single_files_index: Column of ``benchmark-data.csv`` holding the time.
        bulk_test_name: Test name to select in ``bulk.csv``.

    Returns:
        A ``(data, largest_time_unit)`` tuple. ``data`` has the form
        ``{'labels': [...], '<filesystem>': [<float>, ...], ...}``.
    """
    skip_fuse_archive_tar = bulk_test_name == 'bulk_random_read_latency'
    # format: { 'labels': ['btrfs'], 'btrfs': [9, 8, 4, 6]}
    data = {'labels': []}
    with open('assets/benchmarking-dwarfs/data/benchmark-data.csv', 'rt') as f:
        for line in csv.reader(f):
            fs = HelperFunctions.get_fs(line[0])
            if fs == 'fuse-archive (tar)' and skip_fuse_archive_tar:
                continue
            label = HelperFunctions.get_label(line[1])
            if label not in data['labels']:
                data['labels'].append(label)
            data.setdefault(fs, []).append(line[single_files_index])

    # Length of the single-file series; used to pad a filesystem that only
    # shows up in the bulk data so its bulk values stay aligned with the
    # bulk labels.
    single_file_count = max(
        (len(values) for key, values in data.items() if key != 'labels'),
        default=0,
    )

    with open('assets/benchmarking-dwarfs/data/bulk.csv', 'rt') as f:
        bulk_rows = [line for line in csv.reader(f) if line[2] == bulk_test_name]

    # Pick the unit over BOTH data sets before converting anything, so a
    # larger unit in the bulk data can no longer leave the single-file values
    # expressed in a different unit.
    largest_time_unit = 'ns'
    for key, values in data.items():
        if key != 'labels':
            largest_time_unit = _largest_time_unit(values, largest_time_unit)
    for line in bulk_rows:
        largest_time_unit = _largest_time_unit(line[3:], largest_time_unit)

    for key, values in data.items():
        if key == 'labels':
            continue
        data[key] = [
            HelperFunctions.convert_time(value, largest_time_unit)
            for value in values
        ]

    for line in bulk_rows:
        fs = HelperFunctions.get_fs(line[0])
        label = HelperFunctions.get_label(line[1])
        if label not in data['labels']:
            data['labels'].append(label)

        # on the single file tests fuse-archive fails, and it's so small it
        # shows as 0 here anyway, so it might as well be skipped
        if fs == 'fuse-archive (tar)' and largest_time_unit == 's' and skip_fuse_archive_tar:
            continue

        # Convert every sample of the row individually before averaging.
        samples = [
            HelperFunctions.convert_time(value, largest_time_unit)
            for value in line[3:]
        ]

        if fs not in data:
            data[fs] = [0] * single_file_count
        data[fs].append(sum(samples) / len(samples))

    return (data, largest_time_unit)


def run(single_files_index: int, bulk_test_name: str, filename: str, title: str, chart_canvas_id: str):
    """Write the Chart.js script for one benchmark chart.

    The data comes from ``get_data(single_files_index, bulk_test_name)``. The
    generated script assigns ``labels``, ``data`` and ``config`` and then
    creates a bar chart on the canvas whose id is *chart_canvas_id*. It is
    written to ``assets/benchmarking-dwarfs/js/<filename>``.
    """
    # from https://github.com/chartjs/Chart.js/blob/master/docs/scripts/utils.js (CHART_COLORS)
    # modified so similar colors aren't adjacent
    palette = [
        "'rgb(255, 99, 132)'",  # red
        "'rgb(75, 192, 192)'",  # green
        "'rgb(54, 162, 235)'",  # blue
        "'rgb(255, 159, 64)'",  # orange
        "'rgb(153, 102, 255)'",  # purple
        "'rgb(255, 205, 86)'",  # yellow
        "'rgb(201, 203, 207)'",  # grey
    ]

    labels_template = 'labels = $labels$'
    dataset_template = '''
        {
        label: '$label$',
        data: $data$,
        backgroundColor: $color$,
        },
        '''

    config_template = '''
    config = {
        type: 'bar',
        data: {
            datasets: data,
            labels
        },
        options: {
        plugins: {
            title: {
            display: true,
            text: '$title$ - in $timeunit$'
            },
        },
        responsive: true,
        interaction: {
            intersect: false,
        },
        }
    };
    '''

    with open(f'assets/benchmarking-dwarfs/js/{filename}', 'wt') as out:
        data, largest_time_unit = get_data(single_files_index, bulk_test_name)

        # The labels get their own JS variable; what remains in `data` is one
        # series per filesystem.
        chart_labels = data.pop('labels')
        out.write(labels_template.replace('$labels$', str(chart_labels)))

        out.write('\ndata = [')
        for position, (fs, values) in enumerate(data.items()):
            entry = dataset_template.replace('$label$', fs)
            entry = entry.replace('$data$', str(values))
            # Colors are assigned by the filesystem's position in the mapping.
            entry = entry.replace('$color$', palette[position])
            out.write(entry)
        out.write('\n]\n')

        config_js = config_template.replace('$title$', title)
        config_js = config_js.replace('$timeunit$', largest_time_unit)
        out.write(config_js)

        out.write('\nChart.defaults.borderColor = "#eee"\n')
        out.write('Chart.defaults.color = "#eee";\n')
        out.write(f'ctx = document.getElementById("{chart_canvas_id}");\n')
        out.write('new Chart(ctx, config);\n')

def declare_vars():
    """Write ``declare_vars.js``, which declares the globals the chart scripts assign."""
    declarations = [
        'let labels;\n',
        'let config;\n',
        'let data;\n',
        'let ctx;\n',
    ]
    with open('assets/benchmarking-dwarfs/js/declare_vars.js', 'wt') as out:
        for statement in declarations:
            out.write(statement)

if __name__ == '__main__':
    declare_vars()
    # One entry per chart:
    # (single-file CSV column, bulk test name, output script, chart title, canvas id)
    for chart_args in (
        (2, 'bulk_sequential_read', 'seq_read.js', 'Sequential Read Times', 'seq_read_chart'),
        (3, 'bulk_random_read', 'rand_read.js', 'Random Read Times', 'rand_read_chart'),
        (4, 'bulk_sequential_read_latency', 'seq_latency.js', 'Sequential Read Latency', 'seq_read_latency_chart'),
        (5, 'bulk_random_read_latency', 'rand_latency.js', 'Random Read Latency', 'rand_read_latency_chart'),
    ):
        run(*chart_args)
Add a graphs and stuff to benchmarking-dwarfs - not done, but progress 2024-11-17 00:47:41 -06:00			`#!/usr/bin/env python3`
			`import csv`
			`import re`

fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
update comments now that this is less horrible 2024-11-18 14:47:27 -06:00			`# a bunch of now-less-horrible code to make the chart.js code`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00

Add a graphs and stuff to benchmarking-dwarfs - not done, but progress 2024-11-17 00:47:41 -06:00			`class HelperFunctions:`
			`def get_fs(dir):`
			`if dir.endswith('dwarfs'):`
			`return 'DwarFS'`
			`elif dir.endswith('fuse-archive-tar'):`
			`return 'fuse-archive (tar)'`

			`return 'Btrfs'`

			`def get_label(filename):`
			`if filename == '25G-null.bin':`
			`return 'Null 25 GiB file'`
			`elif filename == '25G-random.bin':`
			`return 'Random 25 GiB file'`
			`elif filename == '100M-polygon.txt':`
			`return '100 million-sided polygon data'`
			`elif filename.startswith('kernel'):`
			`return 'Linux LTS kernel'`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00			`elif filename == 'small-files/random':`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`return '1024 random files (avg)'`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00			`elif filename == 'small-files/null':`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`return '1024 null files (avg)'`
Add a graphs and stuff to benchmarking-dwarfs - not done, but progress 2024-11-17 00:47:41 -06:00
			`def convert_time(time: str, unit: str) -> int:`
			`unit_exponents = ['ns', 'µs', 'ms', 's']`

			`if time.endswith('ms'):`
			`current_unit = 'ms'`
			`elif time.endswith('µs'):`
			`current_unit = 'µs'`
			`elif time.endswith('ns'):`
			`current_unit = 'ns'`
			`else:`
			`current_unit = 's'`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
			`unit_multiplier = unit_exponents.index(current_unit) - unit_exponents.index(`
			`unit`
			`)`
			`return HelperFunctions.time_num(time) * (1000**unit_multiplier)`

			`def time_num(time: str):`
			`time = re.sub('[^0-9\\.]', '', time)`
Add a graphs and stuff to benchmarking-dwarfs - not done, but progress 2024-11-17 00:47:41 -06:00			`return float(time)`

DRY 2024-11-18 09:35:14 -06:00			`def get_data(single_files_index: int, bulk_test_name: str):`
fuse-archive now no longer shows up when unnecessary 2024-11-18 19:10:43 -06:00			`skip_fuse_archive_tar = False`
			`if bulk_test_name == 'bulk_random_read_latency':`
			`skip_fuse_archive_tar = True`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`# format: { 'labels': ['btrfs'], 'btrfs': [9, 8, 4, 6]}`
DRY 2024-11-18 09:35:14 -06:00			`data = {'labels': []}`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`with open('assets/benchmarking-dwarfs/data/benchmark-data.csv', 'rt') as f:`
			`for line in csv.reader(f):`
			`fs = HelperFunctions.get_fs(line[0])`
fuse-archive now no longer shows up when unnecessary 2024-11-18 19:10:43 -06:00			`if fs == 'fuse-archive (tar)' and skip_fuse_archive_tar:`
			`continue`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`label = HelperFunctions.get_label(line[1])`
DRY 2024-11-18 09:35:14 -06:00			`data['labels'].append(label) if label not in data[`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`'labels'`
			`] else False`
			`try:`
DRY 2024-11-18 09:35:14 -06:00			`data[fs].append(line[single_files_index])`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`except KeyError:`
DRY 2024-11-18 09:35:14 -06:00			`data[fs] = []`
			`data[fs].append(line[single_files_index])`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`# NOTE: this will break if the bulk data contains a larger unit than the single file data, but that's unlikely to happen so I'm not gonna deal with it`
			`# and it's a bit broken regardless but whatever`
			`largest_time_unit = 'ns'`
DRY 2024-11-18 09:35:14 -06:00			`for key in data.keys():`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`if key == 'labels':`
			`continue`
DRY 2024-11-18 09:35:14 -06:00			`for item in data[key]:`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`if largest_time_unit == 's':`
			`break`
			`if item.endswith('ms'):`
			`largest_time_unit = 'ms'`
			`elif item.endswith('µs') and largest_time_unit != 'ms':`
			`largest_time_unit = 'µs'`
			`elif (`
			`item.endswith('ns')`
			`and largest_time_unit != 'ms'`
			`and largest_time_unit != 'µs'`
			`):`
			`largest_time_unit = 'ns'`
			`elif re.sub('[0-9\\.]', '', item) == 's':`
			`largest_time_unit = 's'`
			`break`

DRY 2024-11-18 09:35:14 -06:00			`for key in data.keys():`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`if key == 'labels':`
			`continue`
DRY 2024-11-18 09:35:14 -06:00			`for i in range(len(data[key])):`
			`data[key][i] = HelperFunctions.convert_time(`
			`data[key][i], largest_time_unit`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`)`

			`with open('assets/benchmarking-dwarfs/data/bulk.csv', 'rt') as f:`
			`for line in csv.reader(f):`
DRY 2024-11-18 09:35:14 -06:00			`if line[2] != bulk_test_name:`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`continue`
			`fs = HelperFunctions.get_fs(line[0])`
			`label = HelperFunctions.get_label(line[1])`
DRY 2024-11-18 09:35:14 -06:00			`data['labels'].append(label) if label not in data[`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`'labels'`
			`] else False`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`for item in line[3:]:`
fuse-archive now no longer shows up when unnecessary 2024-11-18 19:10:43 -06:00			`# FIXME: this breaks if the bulk time is a larger unit than the single file time`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`if largest_time_unit == 's':`
			`break`
			`if item.endswith('ms'):`
			`largest_time_unit = 'ms'`
			`elif item.endswith('µs') and largest_time_unit != 'ms':`
			`largest_time_unit = 'µs'`
			`elif (`
			`item.endswith('ns')`
			`and largest_time_unit != 'ms'`
			`and largest_time_unit != 'µs'`
			`):`
			`largest_time_unit = 'ns'`
			`elif re.sub('[0-9]\\.', '', item) == 's':`
			`largest_time_unit = 's'`
			`break`
fuse-archive now no longer shows up when unnecessary 2024-11-18 19:10:43 -06:00
			`# on the single file tests fuse-archive fails, and it's so small is shows as 0 here anyways, so might as well skip it`
			`if fs == 'fuse-archive (tar)' and largest_time_unit == 's' and skip_fuse_archive_tar:`
			`continue`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00
			`for i in range(len(line[3:])):`
			`line[i + 3] = HelperFunctions.convert_time(item, largest_time_unit)`

fuse-archive now no longer shows up when unnecessary 2024-11-18 19:10:43 -06:00			`try:`
			`data[fs].append(sum(line[3:]) / len(line[3:]))`
			`except KeyError:`
			`data[fs] = [0, 0, 0, 0]`
			`data[fs].append(sum(line[3:]) / len(line[3:]))`


FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00
DRY 2024-11-18 09:35:14 -06:00			`return (data, largest_time_unit)`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00

DRY 2024-11-18 09:35:14 -06:00			`def run(single_files_index: int, bulk_test_name: str, filename: str, title: str, chart_canvas_id: str):`
			`with open(f'assets/benchmarking-dwarfs/js/{filename}', 'wt') as f:`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`# from https://github.com/chartjs/Chart.js/blob/master/docs/scripts/utils.js (CHART_COLORS)`
			`# modified so similar color aren't adjacent`
			`chart_colors = [`
			`"'rgb(255, 99, 132)'", # red`
			`"'rgb(75, 192, 192)'", # green`
			`"'rgb(54, 162, 235)'", # blue`
			`"'rgb(255, 159, 64)'", # orange`
			`"'rgb(153, 102, 255)'", # purple`
			`"'rgb(255, 205, 86)'", # yellow`
			`"'rgb(201, 203, 207)'", # grey`
			`]`

			`labels_code = 'labels = $labels$'`
			`dataset_code = '''`
			`{`
			`label: '$label$',`
			`data: $data$,`
			`backgroundColor: $color$,`
			`},`
			`'''`

			`config_code = '''`
			`config = {`
			`type: 'bar',`
			`data: {`
			`datasets: data,`
			`labels`
			`},`
			`options: {`
			`plugins: {`
			`title: {`
			`display: true,`
			`text: '$title$ - in $timeunit$'`
			`},`
			`},`
			`responsive: true,`
			`interaction: {`
			`intersect: false,`
			`},`
			`}`
			`};`
			`'''`

DRY 2024-11-18 09:35:14 -06:00			`data, largest_time_unit = get_data(single_files_index, bulk_test_name)`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`labels_code = labels_code.replace('$labels$', format(data['labels']))`
			`f.write(labels_code)`
			`data.pop('labels')`
			`f.write('\ndata = [')`
			`for fs in data.keys():`
			`f.write(`
			`dataset_code.replace('$label$', fs)`
			`.replace('$data$', format(data[fs]))`
			`.replace('$color$', format(chart_colors[list(data.keys()).index(fs)]))`
			`)`
			`f.write('\n]\n')`

			`f.write(`
			`config_code.replace('$title$', title).replace(`
			`'$timeunit$', largest_time_unit`
			`)`
			`)`

			`f.write('\nChart.defaults.borderColor = "#eee"\n')`
			`f.write('Chart.defaults.color = "#eee";\n')`
DRY 2024-11-18 09:35:14 -06:00			`f.write(f'ctx = document.getElementById("{chart_canvas_id}");\n')`
FINALLY add the rest of the graphs with this hellish code i will be repenting to the DRY gods for the rest of eternity 2024-11-17 23:40:06 -06:00			`f.write('new Chart(ctx, config);\n')`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
DRY 2024-11-18 09:35:14 -06:00			`def declare_vars():`
			`with open('assets/benchmarking-dwarfs/js/declare_vars.js', 'wt') as f:`
			`f.write('let labels;\n')`
			`f.write('let config;\n')`
			`f.write('let data;\n')`
			`f.write('let ctx;\n')`
fully automate seq_latency js creation 2024-11-17 15:42:47 -06:00
			`if __name__ == '__main__':`
DRY 2024-11-18 09:35:14 -06:00			`declare_vars()`
			`run(2, 'bulk_sequential_read', 'seq_read.js', 'Sequential Read Times', 'seq_read_chart')`
			`run(3, 'bulk_random_read', 'rand_read.js', 'Random Read Times', 'rand_read_chart')`
			`run(4, 'bulk_sequential_read_latency', 'seq_latency.js', 'Sequential Read Latency', 'seq_read_latency_chart')`
			`run(5, 'bulk_random_read_latency', 'rand_latency.js', 'Random Read Latency', 'rand_read_latency_chart')`