some updates to graphing

uwdb · Dec 26, 2023 · 2f6862d · 2f6862d
2 parents 681062a + 03ec42e
commit 2f6862d
Show file tree

Hide file tree

Showing 23 changed files with 848 additions and 289 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,5 +12,5 @@
 /TrueCardinalities
 /Experiments/Results/*
 /Experiments/SerializedSummaries/*
-
+/.vscode
 
diff --git a/Experiments/Experiments.jl b/Experiments/Experiments.jl
@@ -1,19 +1,28 @@
 # Overall Experiments Harness Include File
-using Serialization: serialize, deserialize
+using BenchmarkTools
 using Plots
 using Plots.PlotMeasures
-using StatsPlots
-using CSV, DataFrames
-using Parquet2: Dataset
-using DelimitedFiles: writedlm
-using BenchmarkTools
+using Printf
 using Random
+using StatsPlots
+using Distributed
+@everywhere using CSV
+@everywhere using DataFrames
+@everywhere using DelimitedFiles: writedlm
+@everywhere using Parquet2: Dataset
+@everywhere using Random
+@everywhere using Serialization: serialize, deserialize
+@everywhere using SharedArrays
+@everywhere using WeakRefStrings
+
 
-include("../Source/CardinalityWithColors.jl")
-include("utils.jl")
-include("load_datasets.jl")
+@everywhere include("../Source/CardinalityWithColors.jl")
+@everywhere include("utils.jl")
+@everywhere include("load_datasets.jl")
 include("load_querysets.jl")
-include("build_color_summaries.jl")
+@everywhere include("build_color_summaries.jl")
 include("get_true_cardinalities.jl")
-include("run_estimators.jl")
+@everywhere include("run_estimators.jl")
 include("graph_results.jl")
+
+const TIMEOUT_SEC::Float64 = 60.0
diff --git a/Experiments/Scripts/coloring_methods.jl b/Experiments/Scripts/coloring_methods.jl
@@ -11,7 +11,6 @@ partitioning_schemes = [
                         [(Hash, 64)],
                         [(Degree, 8), (QuasiStable, 32), (NeighborNodeLabels, 24)],
                         [(Degree, 8), (NeighborNodeLabels, 24), (QuasiStable, 32)]]
-partitioning_schemes = [[(QuasiStable, 64)]]
 experiment_params = Vector{ExperimentParams}()
 for dataset in datasets
     for scheme in partitioning_schemes

diff --git a/Experiments/Scripts/comparison_exps.jl b/Experiments/Scripts/comparison_exps.jl
@@ -2,19 +2,12 @@
 using Profile
 include("../Experiments.jl")
 
-#datasets = [aids, yeast, hprd, dblp, youtube, wordnet]
-datasets = [yeast]
+#datasets = [human, aids, lubm80, yeast, hprd, dblp, youtube, eu2005, patents, wordnet]
+datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
+#datasets = [human, aids, lubm80]
 
 experiment_params = Vector{ExperimentParams}()
 for dataset in datasets
-    push!(experiment_params, ExperimentParams(deg_stats_type=CorrDegStats,
-                                                dataset=dataset,
-                                                partitioning_scheme=[(QuasiStable, 64)],
-                                                description = "CorrQ64"))
-    push!(experiment_params, ExperimentParams(deg_stats_type=CorrDegStats,
-                                                dataset=dataset,
-                                                partitioning_scheme=[(QuasiStable, 32), (NeighborNodeLabels, 32),(QuasiStable, 32), (NeighborNodeLabels, 32)],
-                                                description = "CorrQ64N64"))
     push!(experiment_params, ExperimentParams(deg_stats_type=AvgDegStats,
                                                 dataset=dataset,
                                                 partitioning_scheme=[(QuasiStable, 64)],
@@ -23,23 +16,81 @@ for dataset in datasets
                                                 dataset=dataset,
                                                 partitioning_scheme=[(QuasiStable, 32), (NeighborNodeLabels, 32),(QuasiStable, 32), (NeighborNodeLabels, 32)],
                                                 description = "AvgQ64N64"))
+#=
     push!(experiment_params, ExperimentParams(deg_stats_type=MinDegStats,
                                                 dataset=dataset,
                                                 partitioning_scheme=[(QuasiStable, 64)],
+                                                max_cycle_size = -1,
                                                 description = "MinQ64"))
     push!(experiment_params, ExperimentParams(deg_stats_type=MaxDegStats,
                                                 dataset=dataset,
                                                 partitioning_scheme=[(QuasiStable, 64)],
+                                                max_cycle_size = -1,
                                                 description = "MaxQ64"))
+
     push!(experiment_params, ExperimentParams(deg_stats_type=MaxDegStats,
                                                 dataset=dataset,
                                                 partitioning_scheme=[(Hash, 64)],
-                                                description = "MaxH64"))
+                                                max_cycle_size = -1,
+                                                inference_max_paths = 10^30,
+                                                use_partial_sums = false,
+                                                description = "BSK"))
+
+    push!(experiment_params, ExperimentParams(deg_stats_type=AvgDegStats,
+                                                dataset=dataset,
+                                                partitioning_scheme=[(QuasiStable, 1)],
+                                                max_cycle_size = -1,
+                                                description = "IndEst")) =#
 end
 
-build_experiments(experiment_params)
+#build_experiments(experiment_params)
+
+run_estimation_experiments(experiment_params; timeout=1.0)
+
+order = [string(data) for data in datasets]
+
+graph_grouped_boxplot_with_comparison_methods(experiment_params;
+                                                ylims=[10^-5, 10^4],
+                                                y_ticks=[10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3, 10^4],
+                                                y_type = runtime,
+                                                x_type = dataset,
+                                                x_order = order,
+                                                grouping=description,
+                                                dimensions = (1450, 550),
+                                                legend_pos=:top,
+                                                y_label="Inference Latency 10^ (s)",
+                                                filename="overall_runtime")
+
+graph_grouped_boxplot_with_comparison_methods(experiment_params;
+                                                ylims=[10^-21, 10^21],
+                                                y_ticks=[10^-20, 10^-15, 10^-10, 10^-5, 10^-2, 10^0, 10^2, 10^5, 10^10, 10^15, 10^20],
+                                                y_type = estimate_error,
+                                                x_type = dataset,
+                                                x_order = order,
+                                                grouping=description,
+                                                dimensions = (1450, 550),
+                                                legend_pos=:bottomleft,
+                                                y_label="Relative Error 10^",
+                                                filename="overall_error")
+
 
-run_estimation_experiments(experiment_params)
+graph_grouped_bar_plot(experiment_params;
+                        grouping=description,
+                        y_type=memory_footprint,
+                        x_order = order,
+                        ylims=[0, 50],
+                        y_ticks = [10, 20, 30, 40, 50],
+                        legend_pos=:topright,
+                        dimensions = (1000, 550),
+                        y_label="Memory (MBs)",
+                        filename="overall_memory")
 
-graph_grouped_boxplot_with_comparison_methods(experiment_params; ylims=[10^-5, 10^2], y_type = runtime, grouping=description, y_label="Runtime (s)", filename="comparison_exps_runtime_2")
-graph_grouped_boxplot_with_comparison_methods(experiment_params; ylims=[10^-10, 10^15], y_type = estimate_error, grouping=description, y_label="Relative Error", filename="comparison_exps_error_2")
+graph_grouped_bar_plot(experiment_params;
+                        grouping=description,
+                        y_type=build_time,
+                        x_order = order,
+                        ylims=[0, 1600],
+                        y_ticks = [200, 400, 600, 800, 1000, 1200, 1400, 1600],
+                        dimensions = (1000, 550),
+                        y_label="Build Time (s)",
+                        filename="overall_build_time")
diff --git a/Experiments/Scripts/estimator-failure.jl b/Experiments/Scripts/estimator-failure.jl
@@ -0,0 +1,94 @@
+include("../Experiments.jl")
+# Datasets to evaluate; the commented-out list below is a shorter alternative.
+#datasets = [human, aids]
+datasets = [human, aids, lubm80, yeast, dblp, youtube, eu2005, patents]
+queries = load_querysets(datasets)
+num_queries = Dict(string(dataset)=>length(queries[dataset]) for dataset in datasets)
+# Comparison methods and their results, looked up below by (dataset name, method, query id).
+methods, comparison_results = comparison_dataset()
+# Per-method failure counts and failure probabilities, filled in by the loops below.
+failure_counts = Dict()
+failure_probabilities = Dict()
+for method in methods
+    failure_counts[method] = counter(String)
+    failure_probabilities[method] = Dict()
+    for dataset in datasets
+        string_dataset = string(dataset)
+        for query in queries[dataset]
+            qid = get_query_id(string_dataset, query.query_path)
+            comp_key = (string_dataset, method, qid)
+            # Failure: no result for the query, or an estimate of 0, Inf or NaN.
+            # NaN needs `isnan`, because `x == NaN` is always false.
+            failed = !haskey(comparison_results, comp_key)
+            if !failed
+                estimate = comparison_results[comp_key].Estimate
+                failed = estimate == 0 || estimate == Inf || isnan(estimate)
+            end
+            failed && inc!(failure_counts[method], string_dataset)
+        end
+        # Keyed by dataset name, like the BSK / BSK++ / AvgQ64 entries below.
+        failure_probabilities[method][string_dataset] = failure_counts[method][string_dataset] / num_queries[string_dataset]
+    end
+end
+
+# Failure bookkeeping for our own estimator configurations.
+failure_counts["BSK"] = counter(String)
+failure_counts["BSK++"] = counter(String)
+failure_counts["AvgQ64"] = counter(String)
+failure_probabilities["BSK"] = Dict()
+failure_probabilities["BSK++"] = Dict()
+failure_probabilities["AvgQ64"] = Dict()
+
+"""
+    record_estimator_failures!(counts, string_dataset, params)
+
+Run the estimation experiment for `params` (with the shared `TIMEOUT_SEC`
+timeout), read `Experiments/Results/Estimation_<results filename>` as a
+DataFrame, and increment `counts[string_dataset]` once for every row whose
+`Failure` column is true. Returns `counts`.
+"""
+function record_estimator_failures!(counts, string_dataset, params)
+    run_estimation_experiments([params]; timeout=TIMEOUT_SEC)
+    results_filename = params_to_results_filename(params)
+    results_path = "Experiments/Results/Estimation_" * results_filename
+    results_df = CSV.read(results_path, DataFrame; normalizenames=true)
+    for i in 1:nrow(results_df)
+        if results_df[i, :Failure]
+            inc!(counts, string_dataset)
+        end
+    end
+    return counts
+end
+
+for dataset in datasets
+    string_dataset = string(dataset)
+    # The three configurations are run in this order for every dataset; each
+    # uses a single replication.
+    # NOTE(review): `10^30` overflows Int64 and wraps silently; if "unlimited"
+    # is intended, `typemax(Int)` would say so directly -- confirm.
+    estimator_configs = [
+        "BSK" => ExperimentParams(deg_stats_type=MaxDegStats,
+                                    dataset=dataset,
+                                    partitioning_scheme=[(Hash, 64)],
+                                    max_cycle_size = -1,
+                                    inference_max_paths = 10^30,
+                                    use_partial_sums = false,
+                                    description = "BSK",
+                                    n_replications = 1),
+        "BSK++" => ExperimentParams(deg_stats_type=MaxDegStats,
+                                    dataset=dataset,
+                                    partitioning_scheme=[(Hash, 64)],
+                                    max_cycle_size = -1,
+                                    inference_max_paths = 10^30,
+                                    use_partial_sums = true,
+                                    description = "BSK++",
+                                    n_replications=1),
+        "AvgQ64" => ExperimentParams(dataset=dataset, n_replications=1),
+    ]
+    for (name, params) in estimator_configs
+        record_estimator_failures!(failure_counts[name], string_dataset, params)
+        n_failures = failure_counts[name][string_dataset]
+        # Fraction of this dataset's queries that the configuration failed on.
+        failure_probabilities[name][string_dataset] = n_failures / num_queries[string_dataset]
+    end
+end
diff --git a/Experiments/Scripts/just-edge-updates-experiments.jl b/Experiments/Scripts/just-edge-updates-experiments.jl
@@ -7,7 +7,7 @@ datasets::Vector{DATASET} = [aids, human]
 # datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
 # datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
 max_cycles = 6
-proportions_updated = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
+proportions_updated = [0, 0.2, 0.4, 0.6, 0.8, .9, 1.0]
 # To test deletion, we will add a random node / edge and then delete them...
 # proportion_not_updated = 0.5
 
@@ -94,7 +94,7 @@ experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=cur
 # graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
 # compare how cycle stat accuracies are affected by summary updates
 # graph_grouped_box_plot(experiment_params_list, x_type=proportion_deleted, y_type=estimate_error, x_label="proportion added then deleted", y_label="accuracy", grouping=cycle_size, filename="deletion-experiment")
-graph_grouped_bar_plot(experiment_params_list, x_type=dataset, y_type=build_time, y_lims=[0, 30], x_label="Proportion Updated", y_label="Build Time (S)", grouping=proportion_updated, filename="just-edge-updates-build")
-graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, x_label="Proportion Updated", y_label="Estimate Error", grouping=proportion_updated, filename="just-edge-updates-error")
-graph_grouped_bar_plot(experiment_params_list, x_type=dataset, y_type=runtime, y_lims=[0, 0.6], x_label="Proportion Updated", y_label="Runtime (S)", grouping=proportion_updated, filename="just-edge-updates-runtime")
-graph_grouped_bar_plot(experiment_params_list, x_type=dataset, y_type=memory_footprint, y_lims=[0, 20], x_label="Proportion Updated", y_label="Memory Footprint (B)", grouping=proportion_updated, filename="just-edge-updates-memory")
+graph_grouped_bar_plot(experiment_params_list, x_type=dataset, y_type=build_time, y_lims=[0, 10], y_ticks = [0, 2, 4 ,6 ,8, 10], legend_pos=:topright, x_label="Proportion Updated", y_label="Build Time (S)", grouping=proportion_updated, filename="just-edge-updates-build")
+graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error,y_lims=[-20, 15],  x_label="Proportion Updated", y_label="Estimate Error", grouping=proportion_updated, filename="just-edge-updates-error")
+graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=runtime, y_lims=[10^-5, 10], y_ticks = [10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 1, 10], x_label="Proportion Updated", y_label="Runtime (S)", grouping=proportion_updated, filename="just-edge-updates-runtime")
+graph_grouped_bar_plot(experiment_params_list, x_type=dataset, y_type=memory_footprint, y_lims=[0, 20], y_ticks = [0, 5, 10, 15, 20], x_label="Proportion Updated", y_label="Memory (MB)", grouping=proportion_updated, filename="just-edge-updates-memory")