fix uniform sampling strategy

uwdb · Dec 21, 2023 · 999c880 · 999c880
1 parent c7bfd62
commit 999c880
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 16 deletions.
diff --git a/Experiments/Scripts/run-cycle-experiments.jl b/Experiments/Scripts/run-cycle-experiments.jl
@@ -2,14 +2,14 @@ using Plots.PlotMeasures
 include("../Experiments.jl")
 
 # datasets::Vector{DATASET} = [aids, human, lubm80, yago, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
-data = [yeast]
+datasets = [youtube]
 max_cycles = 6
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, max_cycle_size=current_size) for current_dataset in data for current_size in 2:max_cycles]
+experiment_params_list = ExperimentParams[ExperimentParams(dataset=current_dataset, max_cycle_size=current_size) for current_dataset in datasets for current_size in 1:max_cycles]
 
 # println("started building")
-#build_experiments(experiment_params_list)
+build_experiments(experiment_params_list)
 # println("started estimating")
-#run_estimation_experiments(experiment_params_list)
+run_estimation_experiments(experiment_params_list)
 println("started graphing")
 
 graph_grouped_box_plot(experiment_params_list;

diff --git a/Experiments/Scripts/run-inference-sampling-experiments.jl b/Experiments/Scripts/run-inference-sampling-experiments.jl
@@ -2,34 +2,41 @@ using Plots.PlotMeasures
 include("../Experiments.jl")
 
 # datasets::Vector{DATASET} = [aids, human, lubm80, yago, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
-data::Vector{DATASET} = [yeast]
-max_paths = [10, 100, 1000, 10000]
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, inference_max_paths=current_paths) for current_dataset in data for current_paths in max_paths]
+datasets = [youtube]
+max_paths = [10, 100, 500, 2000, 10000]
+experiment_params_list = ExperimentParams[]
+for dataset in datasets
+    for current_paths in max_paths
+        push!(experiment_params_list, ExperimentParams(dataset=dataset, inference_max_paths=current_paths, sampling_strategy = redistributive, description="Importance"))
+        push!(experiment_params_list, ExperimentParams(dataset=dataset, inference_max_paths=current_paths, sampling_strategy = uniform, description="Uniform"))
+    end
+end
+
 println("started building")
 #build_experiments(experiment_params_list)
 println("started estimating")
 #run_estimation_experiments(experiment_params_list)
 println("started graphing")
 graph_grouped_box_plot(experiment_params_list,
-                        x_type=dataset,
+                        x_type=inference_paths,
                         y_type=estimate_error,
                         ylims=[10^-20, 10^20],
                         y_ticks=[10^-20, 10^-15, 10^-10, 10^-5, 1, 10^5, 10^10, 10^15],
-                        dimensions = (600, 550),
+                        dimensions = (600, 400),
                         legend_pos = :topleft,
                         x_label="Maximum Inference Paths",
                         y_label="Estimate Error 10^",
-                        grouping=inference_paths,
+                        grouping=description,
                         filename="inference-paths-error")
 
 graph_grouped_box_plot(experiment_params_list,
-                        x_type=dataset,
+                        x_type=inference_paths,
                         y_type=runtime,
                         ylims=[.0001, 100],
                         y_ticks=[.001, .01, .1, 1, 10, 100],
-                        dimensions = (600, 550),
+                        dimensions = (600, 400),
                         legend_pos = :topleft,
                         x_label="Maximum Inference Paths",
                         y_label="Runtime 10^ (s)",
-                        grouping=inference_paths,
+                        grouping=description,
                         filename="inference-paths-runtime")
diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl
@@ -60,7 +60,7 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}
                             [log10(y)  for y in y_values],
                             group = groups,
                             x_ticks = x_ticks,
-                            xlims = [0, length(x_order)],
+                            xlims = [0, length(x_order) + .5],
                             ylims =  (log10(ylims[1]),log10(ylims[2])),
                             y_ticks = [log10(y) for y in y_ticks],
                             legend = legend_pos,

diff --git a/Experiments/run_estimators.jl b/Experiments/run_estimators.jl
@@ -17,6 +17,9 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar
                                     only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)) for _ in 1:3]
             estimate_time = median([x.time for x in  estimate_results]) # Convert back to seconds from nanoseconds
             estimate = max(1, estimate_results[1].value)
+            if isinf(estimate) || isnan(estimate)
+                estimate = 1.0
+            end
             query_type = all_queries[dataset][i].query_type
             experiment_results[i] = (estimate, exact_size, estimate_time, query_type, query_path, nv(query.graph))
         end

diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl
@@ -70,7 +70,7 @@ function sample_paths(partial_paths::Matrix{Color}, partial_weights::Vector{W},
     sample_weights = [get_count(w) for w in new_partial_weights]
     sample_weights = AnalyticWeights(sample_weights ./ overall_bounds_sum)
     if sampling_strategy == uniform
-        sample_weights = AnalyticWeights([1.0 for i in eachindex(new_partial_paths)] ./ num_nonzero_entries)
+        sample_weights = AnalyticWeights([1.0 for i in eachindex(new_partial_weights)] ./ length(new_partial_weights))
     end
     sample_indices::Vector{Int} = sample(1:length(new_partial_weights), sample_weights,  num_samples; replace=false)
 
@@ -89,7 +89,7 @@ function sample_paths(partial_paths::Matrix{Color}, partial_weights::Vector{W},
             # scale the weights so that their sum equals the input weight's sum
             overall_bounds_sum / sampled_bounds_sum
         else
-            1.0 / sample_weights[i]
+            1.0 / (sample_weights[i] * num_samples)
         end
         sampled_partial_weights[i] = scale_coloring(new_partial_weights[idx], inverse_sampling_probability)
     end