Skip to content

Commit

Permalink
Filter duplicates (google#1699)
Browse files Browse the repository at this point in the history
1. Update CF dependency to `2.6.0` to use `crash comparer`.
2. Replace the naive `duplicated` function from `pandas` with
`crash_comparer` from `clusterfuzz`.
  • Loading branch information
DonggeLiu authored Feb 14, 2023
1 parent 27bee8e commit aa1ed29
Show file tree
Hide file tree
Showing 7 changed files with 369 additions and 3 deletions.
27 changes: 25 additions & 2 deletions analysis/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Utility functions for data (frame) transformations."""
import pandas as pd
from clusterfuzz.stacktraces.crash_comparer import CrashComparer

from analysis import stat_tests
from common import benchmark_utils
Expand Down Expand Up @@ -128,17 +129,39 @@ def filter_max_time(experiment_df, max_time):
return experiment_df[experiment_df['time'] <= max_time]


def is_unique_crash(crash_group):
    """Flag the first occurrence of each distinct crash in |crash_group|.

    Crashes are compared with ClusterFuzz's CrashComparer instead of exact
    string equality: a crash counts as a first occurrence only if its crash
    state is not similar to the state of any earlier row in the group.

    Returns a boolean Series named |firsts| that shares the index of
    |crash_group|. |crash_group| itself is left unmodified, so the function
    is safe to use with groupby(...).apply().
    """
    seen_states = set()
    is_firsts = []
    for crash_key in crash_group['crash_key']:
        # crash_key is a concatenation of crash type and crash state:
        # '{crash_type}:{crash_state}'. Keep everything after the first ':'.
        # A key without any ':' (e.g. the string form of a missing value)
        # yields an empty crash state.
        crash_state = str(crash_key).partition(':')[2]
        is_firsts.append(not any(
            CrashComparer(crash_state, seen_state).is_similar()
            for seen_state in seen_states))
        # Every state is recorded, even a non-unique one, so later rows are
        # compared against all earlier rows of the group.
        seen_states.add(crash_state)
    # Build a fresh Series rather than writing a column into the caller's
    # frame and reading it back.
    return pd.Series(is_firsts,
                     index=crash_group.index,
                     name='firsts',
                     dtype=bool)


def add_bugs_covered_column(experiment_df):
"""Return a modified experiment df in which adds a |bugs_covered| column,
a cumulative count of bugs covered over time."""
if 'crash_key' not in experiment_df:
experiment_df['bugs_covered'] = 0
return experiment_df
grouping1 = ['fuzzer', 'benchmark', 'trial_id', 'crash_key']
grouping2 = ['fuzzer', 'benchmark', 'trial_id']
grouping3 = ['fuzzer', 'benchmark', 'trial_id', 'time']
df = experiment_df.sort_values(grouping3)
df['firsts'] = ~df.duplicated(subset=grouping1) & ~df.crash_key.isna()
df['firsts'] = (
df.groupby(grouping2, group_keys=False).apply(is_unique_crash) &
~df.crash_key.isna())
df['bugs_cumsum'] = df.groupby(grouping2)['firsts'].transform('cumsum')
df['bugs_covered'] = (
df.groupby(grouping3)['bugs_cumsum'].transform('max').astype(int))
Expand Down
138 changes: 138 additions & 0 deletions analysis/test_data/bug_experiment_1_df.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
Unnamed: 0,git_hash,experiment_filestore,experiment,fuzzer,benchmark,time_started,time_ended,trial_id,time,edges_covered,fuzzer_stats,crash_key
0,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,0,4,,
1,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,0,4,,
2,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,0,4,,
3,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,0,4,,
4,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,0,24,,
5,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,0,24,,
6,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,0,24,,
7,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,0,24,,
8,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,900,2853,,
9,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,900,3211,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
10,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,900,3211,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
11,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,900,3106,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
12,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,900,3106,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
13,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,900,3105,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
14,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,900,3105,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
15,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,900,3870,,
16,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,900,3981,,
17,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,900,5322,,
18,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,900,5111,,
19,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,1800,3219,,
20,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,1800,2857,,
21,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,1800,3112,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
22,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,1800,3112,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
23,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,1800,3130,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
24,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,1800,3130,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
25,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,1800,4042,,
26,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,1800,4025,,
27,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,1800,5355,,
28,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,1800,5366,,
29,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,2700,3220,,
30,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,2700,3131,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
31,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,2700,4059,,
32,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,2700,4240,,
33,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,2700,2862,,
34,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,2700,3113,,
35,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,2700,5394,,
36,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,2700,5467,,
37,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,3600,3221,,
38,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,3600,5400,,
39,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,3600,5468,,
40,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,3600,4087,,
41,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,3600,4266,,
42,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,3600,2869,,
43,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,3600,3114,,
44,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,3600,3132,,
45,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,4500,3221,,
46,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,4500,5476,,
47,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,4500,5400,,
48,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,4500,4294,,
49,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,4500,2869,,
50,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,4500,3132,,
51,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,4500,3114,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
52,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,4500,4147,,
53,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,5400,3221,,
54,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,5400,5407,,
55,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,5400,5525,,
56,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,5400,4156,,
57,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,5400,4313,,
58,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,5400,2873,,
59,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,5400,3132,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
60,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,5400,3132,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
61,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,5400,3115,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
62,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,5400,3115,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
63,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,6300,3221,,
64,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,6300,3132,,
65,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,6300,5525,,
66,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,6300,5428,,
67,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,6300,4320,,
68,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,6300,2873,,
69,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,6300,4166,,
70,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,6300,3135,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
71,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,6300,3135,,"Null-dereference:aspeller::SfxEntry::check
aspeller::AffixMgr::suffix_check
aspeller::AffixMgr::munch
"
72,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.223215,,107,7200,3221,,
73,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,aspell_aspell_fuzzer,2023-02-10 09:51:02.193808,,108,7200,2873,,
74,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.213901,,112,7200,3135,,
75,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,aspell_aspell_fuzzer,2023-02-10 09:51:02.185009,,111,7200,3132,,"Null-dereference:aspeller::PfxEntry::check
aspeller::AffixMgr::prefix_check
aspeller::AffixMgr::munch
"
76,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.165263,,105,7200,4324,,
77,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.149702,,109,7200,5429,,
78,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,afl,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.205513,,110,7200,5534,,
79,415c95cf5b5d2f29a7222b5c06685b2416f67be0,/tmp/experiment-data,generate-bug-df-18,libfuzzer,arrow_parquet-arrow-fuzz,2023-02-10 09:51:02.210657,,106,7200,4174,,
Loading

0 comments on commit aa1ed29

Please sign in to comment.