From ba1cf84c867648d9504e03e086aaf8721d87c60f Mon Sep 17 00:00:00 2001
From: Scott Sandre
Date: Tue, 24 Sep 2024 15:56:12 -0700
Subject: [PATCH] Create project/README.md

---
 project/README.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 project/README.md

diff --git a/project/README.md b/project/README.md
new file mode 100644
index 0000000000..8510aa2972
--- /dev/null
+++ b/project/README.md
@@ -0,0 +1,30 @@
# Updating the delta-spark `TestParallelization` Top 50 Slowest Test Suites List

- Cherry-pick the changes from https://github.com/delta-io/delta/pull/3694
- That PR adds a test report listener to delta-spark that outputs CSV files containing per-JVM, per-group (thread), and per-test runtimes
- Run the CI and download the generated CSV artifacts
- Use the following PySpark code to get the top 50 slowest test suites
- Copy the output into ChatGPT and ask it to format it as a Scala `List`, or generate the list directly with the sketch after the code block

```python
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Schema of the CSV files written by the test report listener
schema = StructType([
    StructField("test_suite", StringType(), True),
    StructField("test_name", StringType(), True),
    StructField("execution_time_ms", LongType(), True),
    StructField("result", StringType(), True)
])

# Directory containing the downloaded CSV artifacts
csv_dir = "..."

# Sum per-test runtimes by suite (skipping the -1 sentinel for missing
# runtimes), convert milliseconds to minutes, and show the 50 slowest suites.
# Run this inside a `pyspark` shell, where `spark` is predefined.
spark.read.csv(csv_dir, schema=schema) \
    .filter(col("execution_time_ms") != -1) \
    .groupBy("test_suite") \
    .agg((sum("execution_time_ms") / 60000).alias("execution_time_mins")) \
    .orderBy(col("execution_time_mins").desc()) \
    .limit(50) \
    .select("test_suite", "execution_time_mins") \
    .show(50, truncate=False)
```
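
As an alternative to the ChatGPT step, the query result can be printed as a Scala `List` literal directly from the same `pyspark` shell. The sketch below reuses `schema` and `csv_dir` from the code above; the constant name `top50SlowestSuites` is only illustrative, so match whatever name `TestParallelization` actually uses.

```python
from pyspark.sql.functions import col, sum

# A minimal sketch: collect the names of the 50 slowest suites, then print
# them as a Scala List literal that can be pasted into TestParallelization.
top_50 = spark.read.csv(csv_dir, schema=schema) \
    .filter(col("execution_time_ms") != -1) \
    .groupBy("test_suite") \
    .agg((sum("execution_time_ms") / 60000).alias("execution_time_mins")) \
    .orderBy(col("execution_time_mins").desc()) \
    .limit(50) \
    .select("test_suite") \
    .collect()

# `top50SlowestSuites` is a hypothetical name; use the one in TestParallelization.
entries = ",\n".join(f'  "{row.test_suite}"' for row in top_50)
print(f"val top50SlowestSuites: List[String] = List(\n{entries}\n)")
```

This keeps the update reproducible end to end; only the exact formatting (constant name, `List` vs. `Seq`) needs to be matched to the existing code in `TestParallelization`.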