# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ExperimentResults class."""
import functools
import os
import pandas as pd
import seaborn as sns
from analysis import benchmark_results
from analysis import coverage_data_utils
from analysis import data_utils
from analysis import stat_tests
from common import experiment_utils


def strip_gs_protocol(url):
    """Removes the leading gs:// from |url|."""
    protocol = 'gs://'
    if url.startswith(protocol):
        return url[len(protocol):]
    return url
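
# A minimal usage sketch for strip_gs_protocol() above (the bucket name is
# hypothetical, shown only for illustration):
#
#     strip_gs_protocol('gs://my-bucket/experiment-data')
#     # -> 'my-bucket/experiment-data'
#     strip_gs_protocol('my-bucket/experiment-data')
#     # -> 'my-bucket/experiment-data' (returned unchanged)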


class ExperimentResults:  # pylint: disable=too-many-instance-attributes
    """Provides the main interface for getting various analysis results and
    plots about an experiment, represented by |experiment_df|.

    Can be used as the context for template-based report generation. Each
    result is a property, which is computed lazily and memoized for reuse.
    Therefore, when used as the context of a report template, only the
    properties needed for the given report will be computed.
    """

    # Summary table style.
    _SUMMARY_TABLE_STYLE = [
        dict(selector='td, th',
             props=[('width', '25px'), ('padding', '7px 5px')]),
        dict(selector='th.col_heading',
             props=[('max-width', '25px'), ('overflow', 'visible'),
                    ('transform-origin', 'bottom left'),
                    ('transform', 'translateX(20px) rotate(-45deg)')])
    ]

    def __init__(  # pylint: disable=too-many-arguments
            self,
            experiment_df,
            coverage_dict,
            output_directory,
            plotter,
            experiment_name=None):
        if experiment_name:
            self.name = experiment_name
        else:
            # Take name from first row.
            self.name = experiment_df.experiment.iloc[0]

        # FuzzBench repo commit hash.
        self.git_hash = None
        if 'git_hash' in experiment_df.columns:
            # Not possible to represent hashes for multiple experiments.
            if len(experiment_df.experiment.unique()) == 1:
                self.git_hash = experiment_df.git_hash.iloc[0]

        # Earliest trial start time.
        self.started = experiment_df.time_started.dropna().min()
        # Latest trial end time.
        self.ended = experiment_df.time_ended.dropna().max()

        # Keep a full version of the dataframe (to count unique bugs).
        self._full_experiment_df = experiment_df

        # Keep data frame without non-interesting columns.
        experiment_df = data_utils.drop_uninteresting_columns(experiment_df)

        # Add relative columns (% of experiment max, % of fuzzer max).
        self._experiment_df = data_utils.add_relative_columns(experiment_df)

        # Directory where the rendered plots are written to.
        self._output_directory = output_directory
        self._plotter = plotter

        # Dictionary to store the full coverage data.
        self._coverage_dict = coverage_dict

        self.experiment_filestore = strip_gs_protocol(
            experiment_df.experiment_filestore.iloc[0])

    def _get_full_path(self, filename):
        return os.path.join(self._output_directory, filename)

    def linkify_names(self, df):
        """For any DataFrame indexed by fuzzer names, turns the fuzzer names
        into links to the fuzzer's directory (with its description) in the
        FuzzBench repository on GitHub."""
        assert df.index.name == 'fuzzer'

        def description_link(commit, fuzzer):
            return (f'<a href="https://github.com/google/fuzzbench/blob/'
                    f'{commit}/fuzzers/{fuzzer}">{fuzzer}</a>')

        commit = self.git_hash if self.git_hash else 'master'
        df.index = df.index.map(lambda fuzzer: description_link(commit, fuzzer))
        return df
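
    # For linkify_names() above, an index entry such as 'afl' becomes the
    # following (commit hash 'abc123' is illustrative):
    #
    #     <a href="https://github.com/google/fuzzbench/blob/abc123/fuzzers/afl">afl</a>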

    @functools.cached_property
    def _experiment_snapshots_df(self):
        """Data frame containing only the time snapshots, for each benchmark,
        based on which we do further analysis, i.e., statistical tests and
        ranking."""
        return data_utils.get_experiment_snapshots(self._experiment_df)

    @property
    @functools.lru_cache()
    def benchmarks(self):
        """Returns the list of BenchmarkResults.

        This is cheap, as no computation is done on the benchmark data
        until a property is evaluated.
        """
        benchmark_names = self._experiment_df.benchmark.unique()
        return [
            benchmark_results.BenchmarkResults(name, self._experiment_df,
                                               self._coverage_dict,
                                               self._output_directory,
                                               self._plotter)
            for name in sorted(benchmark_names)
        ]

    @property
    @functools.lru_cache()
    def type(self):
        """Returns the type of the experiment, i.e., 'code' or 'bug',
        indicating whether the experiment involved code coverage benchmarks
        or bug coverage benchmarks.

        Raises ValueError if the benchmark types are mixed.
        """
        benchmarks = [benchmark.name for benchmark in self.benchmarks]
        return experiment_utils.get_experiment_type(benchmarks)

    @property
    def _relevant_column(self):
        """Returns the name of the column that will be used as the basis of
        the analysis (e.g., 'edges_covered' or 'bugs_covered')."""
        return 'edges_covered' if self.type == 'code' else 'bugs_covered'

    @property
    @functools.lru_cache()
    def summary_table(self):
        """A pivot table of medians for each fuzzer on each benchmark."""
        return data_utils.experiment_pivot_table(
            self._experiment_snapshots_df,
            functools.partial(data_utils.benchmark_rank_by_median,
                              key=self._relevant_column))

    def _relative_summary_table(self, key_column='edges_covered'):
        """A pivot table of medians (% of experiment max per benchmark)
        for each fuzzer on each benchmark."""
        pivot = data_utils.experiment_pivot_table(
            self._experiment_snapshots_df,
            functools.partial(data_utils.benchmark_rank_by_percent,
                              key=key_column))

        # Remove axis names.
        pivot = pivot.rename_axis(index=None, columns=None)

        # Add rows for the median and mean values.
        nrows, _ = pivot.shape
        pivot.loc['FuzzerMedian'] = pivot.iloc[0:nrows].median()
        pivot.loc['FuzzerMean'] = pivot.iloc[0:nrows].mean()

        # Sort fuzzers left to right by FuzzerMean.
        pivot = pivot.sort_values(by='FuzzerMean', axis=1, ascending=False)

        # Move the Median and Mean rows to the top.
        row_index = pivot.index.to_list()
        pivot = pivot.reindex(row_index[-2:] + row_index[:-2])

        # Slicer for the Mean row.
        idx = pd.IndexSlice['FuzzerMean', :]

        whbl = sns.light_palette('lightblue', n_colors=30, as_cmap=True)
        pivot = pivot.style\
            .background_gradient(axis=1, cmap=whbl, vmin=95, vmax=100)\
            .highlight_max(axis=1, color='lightgreen')\
            .format('{:.2f}')\
            .apply(data_utils.underline_row, axis=1, subset=idx)\
            .set_table_styles(self._SUMMARY_TABLE_STYLE)
        return pivot
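
    # For _relative_summary_table() above, the frame looks roughly like this
    # before styling (fuzzer and benchmark names, and all values, are
    # illustrative):
    #
    #                   afl   libfuzzer
    #     FuzzerMedian  99.1       96.4
    #     FuzzerMean    98.7       96.9
    #     benchmark_1   99.8       95.2
    #     benchmark_2   98.3       97.6
    #
    # The background gradient saturates between vmin=95 and vmax=100, which
    # appears intended to emphasize differences near the top of the range.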

    @property
    @functools.lru_cache()
    def relative_code_summary_table(self):
        """Summary table of median relative code coverage."""
        return self._relative_summary_table()

    @property
    @functools.lru_cache()
    def relative_bug_summary_table(self):
        """Summary table of median relative bug coverage."""
        return self._relative_summary_table(key_column='bugs_covered')

    @property
    def found_bugs_summary_table(self):
        """A pivot table of the total number of bugs found by each fuzzer on
        each bug benchmark."""
        grouping = ['benchmark', 'fuzzer']
        groups = self._full_experiment_df.groupby(grouping).crash_key.nunique()
        groups = groups.reset_index()
        pivot = groups.pivot(index='benchmark',
                             columns='fuzzer',
                             values='crash_key')

        # Save the fuzzer names before adding the Total column.
        fuzzer_names = pivot.columns
        pivot['Total'] = self._full_experiment_df.groupby(
            'benchmark').crash_key.nunique()
        pivot = pivot.rename_axis(index=None, columns=None)

        # Add a row for the sum of all bugs found.
        nrows, _ = pivot.shape
        pivot.loc['FuzzerSum'] = pivot.iloc[0:nrows].sum()

        # Move the Sum row to the top.
        row_index = pivot.index.to_list()
        pivot = pivot.reindex(row_index[-1:] + row_index[:-1])

        # Slicer for the Sum row.
        idx = pd.IndexSlice['FuzzerSum', :]

        # Highlight the per-row maximum, skipping all-zero rows.
        def highlight_max(row):
            if row.sum() == 0:
                return ['' for _ in row]
            row_max = row.max()
            is_max = row == row_max
            return [
                'background-color: lightgreen' if value else ''
                for value in is_max
            ]

        # Sort fuzzers left to right by FuzzerSum.
        pivot = pivot.sort_values(by='FuzzerSum', axis=1, ascending=False)

        pivot = pivot.style\
            .format('{:.0f}')\
            .apply(highlight_max, axis=1, subset=fuzzer_names)\
            .apply(data_utils.underline_row, axis=1, subset=idx)\
            .set_table_styles(self._SUMMARY_TABLE_STYLE)
        return pivot
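
    # For found_bugs_summary_table above, a rough sketch of the result
    # (illustrative names and values; 'Total' counts unique crash keys per
    # benchmark across all fuzzers, so it can exceed any single fuzzer's
    # count):
    #
    #                  Total   afl   libfuzzer
    #     FuzzerSum       15    12           9
    #     benchmark_1      6     5           4
    #     benchmark_2      9     7           5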

    @property
    def rank_by_unique_coverage_average_normalized_score(self):
        """Rank fuzzers using average normalized score on unique code coverage
        across benchmarks."""
        benchmarks_unique_coverage_list = [
            benchmark.unique_branch_cov_df for benchmark in self.benchmarks
        ]
        return coverage_data_utils.rank_by_average_normalized_score(
            benchmarks_unique_coverage_list)

    def _ranking(self, benchmark_level_ranking_function,
                 experiment_level_ranking_function):
        """Composes a per-benchmark ranking function with an across-benchmark
        aggregation function over the snapshot data."""
        return data_utils.experiment_level_ranking(
            self._experiment_snapshots_df,
            functools.partial(benchmark_level_ranking_function,
                              key=self._relevant_column),
            experiment_level_ranking_function)
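
    # A sketch of the two-level composition in _ranking() above: the
    # benchmark-level function scores fuzzers within each benchmark, and the
    # experiment-level function aggregates those scores. For example,
    #
    #     self._ranking(data_utils.benchmark_rank_by_median,
    #                   data_utils.experiment_rank_by_average_rank)
    #
    # first ranks fuzzers by median coverage on every benchmark, then
    # averages each fuzzer's rank across benchmarks.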

    @property
    def rank_by_average_rank_and_average_rank(self):
        """Rank fuzzers using average rank per benchmark and average rank
        across benchmarks."""
        return self._ranking(data_utils.benchmark_rank_by_average_rank,
                             data_utils.experiment_rank_by_average_rank)

    @property
    def rank_by_mean_and_average_rank(self):
        """Rank fuzzers using mean coverage per benchmark and average rank
        across benchmarks."""
        return self._ranking(data_utils.benchmark_rank_by_mean,
                             data_utils.experiment_rank_by_average_rank)

    @property
    def rank_by_median_and_average_rank(self):
        """Rank fuzzers using median coverage per benchmark and average rank
        across benchmarks."""
        return self._ranking(data_utils.benchmark_rank_by_median,
                             data_utils.experiment_rank_by_average_rank)

    @property
    def rank_by_median_and_average_normalized_score(self):
        """Rank fuzzers using median coverage per benchmark and average
        normalized score across benchmarks."""
        return self._ranking(
            data_utils.benchmark_rank_by_median,
            data_utils.experiment_rank_by_average_normalized_score)

    @property
    def rank_by_median_and_number_of_firsts(self):
        """Rank fuzzers using median coverage per benchmark and number of
        first places across benchmarks."""
        return self._ranking(data_utils.benchmark_rank_by_median,
                             data_utils.experiment_rank_by_num_firsts)

    @property
    def rank_by_stat_test_wins_and_average_rank(self):
        """Rank fuzzers using statistical test wins per benchmark and average
        rank across benchmarks."""
        return self._ranking(data_utils.benchmark_rank_by_stat_test_wins,
                             data_utils.experiment_rank_by_average_rank)

    @property
    def friedman_p_value(self):
        """Friedman test result."""
        return stat_tests.friedman_test(self.summary_table)

    @property
    @functools.lru_cache()
    def friedman_posthoc_p_values(self):
        """Friedman posthoc test results."""
        return stat_tests.friedman_posthoc_tests(self.summary_table)

    @property
    def friedman_conover_plot(self):
        """Friedman/Conover posthoc test result plot."""
        plot_filename = 'experiment_friedman_conover_plot.svg'
        self._plotter.write_heatmap_plot(
            self.friedman_posthoc_p_values['conover'],
            self._get_full_path(plot_filename),
            symmetric=True)
        return plot_filename

    @property
    def friedman_nemenyi_plot(self):
        """Friedman/Nemenyi posthoc test result plot."""
        plot_filename = 'experiment_friedman_nemenyi_plot.svg'
        self._plotter.write_heatmap_plot(
            self.friedman_posthoc_p_values['nemenyi'],
            self._get_full_path(plot_filename),
            symmetric=True)
        return plot_filename

    @property
    def critical_difference_plot(self):
        """Critical difference diagram.

        Shows the average rank of each fuzzer across all benchmarks,
        based on the median of final coverage.
        """
        average_ranks = self.rank_by_median_and_average_rank
        num_of_benchmarks = self.summary_table.shape[0]

        plot_filename = 'experiment_critical_difference_plot.svg'
        self._plotter.write_critical_difference_plot(
            average_ranks, num_of_benchmarks,
            self._get_full_path(plot_filename))
        return plot_filename