From 7db0f4f17c187826f520bd3ad4f920c9fd894fdb Mon Sep 17 00:00:00 2001 From: Alberto Spelta Date: Mon, 5 Aug 2024 14:37:27 +0200 Subject: [PATCH] Add batch size control for column statistics analysis This commit introduces a new constant, DefaultColumnBatchSize in the StatExtractor to define the default number of rows processed at a time during column statistics analysis. The UpdateStatisticsModel method has been updated to include a columnBatchSize parameter, which is then passed to the LoadColumnStatistics method. --- src/Dax.Model.Extractor/StatExtractor.cs | 13 +++++++++---- src/Dax.Model.Extractor/TomExtractor.cs | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/Dax.Model.Extractor/StatExtractor.cs b/src/Dax.Model.Extractor/StatExtractor.cs index 925980a..8443719 100644 --- a/src/Dax.Model.Extractor/StatExtractor.cs +++ b/src/Dax.Model.Extractor/StatExtractor.cs @@ -8,6 +8,11 @@ namespace Dax.Model.Extractor { public class StatExtractor { + /// <summary> /// The default number of rows processed at a time during column statistics analysis. /// </summary> + public const int DefaultColumnBatchSize = 50; + protected Dax.Metadata.Model DaxModel { get; private set; } protected IDbConnection Connection { get; private set; } protected int CommandTimeout { get; private set; } = 0; @@ -26,7 +31,7 @@ protected IDbCommand CreateCommand(string commandText) // UpdateStatisticsModel has been marked as obsolete because its usage may require rerunning the DMVs for models with DirectLake partitions. Since this logic should be handled by the library, we may consider removing it from the public APIs in a future release. [Obsolete("This method may produce incomplete results if used on a model with DirectLake partitions and DirectLakeExtractionMode parameter set to anything other than ResidentOnly. 
Use TomExtractor.GetDaxModel instead.")] - public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly) + public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int columnBatchSize = DefaultColumnBatchSize) { // TODO: Remove after rafactoring the code to use ExtractorSettings: ExtractorProperties as a parameter daxModel.ExtractorProperties.StatisticsEnabled = true; @@ -36,7 +41,7 @@ public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnect StatExtractor extractor = new StatExtractor(daxModel, connection); extractor.LoadTableStatistics(analyzeDirectQuery, analyzeDirectLake); - extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake); + extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake, columnBatchSize); extractor.LoadRelationshipStatistics(sampleRows, analyzeDirectQuery, analyzeDirectLake); // Update ExtractionDate @@ -242,7 +247,7 @@ private static string EmbedNameInString(string originalName) { return originalName.Replace("\"", "\"\""); } - private void LoadColumnStatistics(bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly) + private void LoadColumnStatistics(bool analyzeDirectQuery, DirectLakeExtractionMode analyzeDirectLake, int columnBatchSize) { var allColumns = (from t in DaxModel.Tables @@ -257,7 +262,7 @@ from c in t.Columns || (analyzeDirectLake == DirectLakeExtractionMode.Full) ) select c).ToList(); - var loopColumns = allColumns.SplitList(50); // no more than 9999 + var loopColumns = allColumns.SplitList(columnBatchSize); // no more than 9999 foreach ( var columnSet in loopColumns ) { var 
idString = 0; var dax = "EVALUATE "; diff --git a/src/Dax.Model.Extractor/TomExtractor.cs b/src/Dax.Model.Extractor/TomExtractor.cs index 41882d5..299c362 100644 --- a/src/Dax.Model.Extractor/TomExtractor.cs +++ b/src/Dax.Model.Extractor/TomExtractor.cs @@ -284,7 +284,7 @@ public static Dax.Metadata.Model GetDaxModel(Tom.Model model, string extractorAp return extractor.DaxModel; } - public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly) + public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize) { Tom.Server server = new Tom.Server(); server.Connect(connectionString); @@ -304,7 +304,7 @@ public static Dax.Metadata.Model GetDaxModel(string connectionString, string app if (readStatisticsFromData) { #pragma warning disable CS0618 // Type or member is obsolete - StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake); + StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize); #pragma warning restore CS0618 // Type or member is obsolete // If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded. @@ -334,7 +334,7 @@ public static Tom.Database GetDatabase(string connectionString) return db ?? throw new ArgumentException($"The database '{databaseName}' could not be found. 
Either it does not exist or you do not have admin rights to it."); } - public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly) + public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize) { Tom.Database db = GetDatabase(serverName, databaseName); Tom.Model tomModel = db.Model; @@ -352,7 +352,7 @@ public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseN if (readStatisticsFromData) { #pragma warning disable CS0618 // Type or member is obsolete - StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake); + StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize); #pragma warning restore CS0618 // Type or member is obsolete // If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded.