Skip to content

Commit cbc452a

Browse files
Add batch size control for column statistics analysis (#135)
This commit introduces a new constant, DefaultColumnBatchSize, in the StatExtractor to define the default number of columns processed per batch during column statistics analysis. The UpdateStatisticsModel method has been updated to include a columnBatchSize parameter, which is then passed to the LoadColumnStatistics method.
1 parent 3240d54 commit cbc452a

File tree

2 files changed

+13
-8
lines changed

2 files changed

+13
-8
lines changed

src/Dax.Model.Extractor/StatExtractor.cs

+9-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ namespace Dax.Model.Extractor
88
{
99
public class StatExtractor
1010
{
11+
/// <summary>
12+
/// The default number of columns processed per batch during column statistics analysis.
13+
/// </summary>
14+
public const int DefaultColumnBatchSize = 50;
15+
1116
protected Dax.Metadata.Model DaxModel { get; private set; }
1217
protected IDbConnection Connection { get; private set; }
1318
protected int CommandTimeout { get; private set; } = 0;
@@ -26,7 +31,7 @@ protected IDbCommand CreateCommand(string commandText)
2631

2732
// UpdateStatisticsModel has been marked as obsolete because its usage may require rerunning the DMVs for models with DirectLake partitions. Since this logic should be handled by the library, we may consider removing it from the public APIs in a future release.
2833
[Obsolete("This method may produce incomplete results if used on a model with DirectLake partitions and DirectLakeExtractionMode parameter set to anything other than ResidentOnly. Use TomExtractor.GetDaxModel instead.")]
29-
public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
34+
public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnection connection, int sampleRows = 0, bool analyzeDirectQuery = false , DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int columnBatchSize = DefaultColumnBatchSize)
3035
{
3136
// TODO: Remove after refactoring the code to use ExtractorSettings: ExtractorProperties as a parameter
3237
daxModel.ExtractorProperties.StatisticsEnabled = true;
@@ -36,7 +41,7 @@ public static void UpdateStatisticsModel(Dax.Metadata.Model daxModel, IDbConnect
3641

3742
StatExtractor extractor = new StatExtractor(daxModel, connection);
3843
extractor.LoadTableStatistics(analyzeDirectQuery, analyzeDirectLake);
39-
extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake);
44+
extractor.LoadColumnStatistics(analyzeDirectQuery, analyzeDirectLake, columnBatchSize);
4045
extractor.LoadRelationshipStatistics(sampleRows, analyzeDirectQuery, analyzeDirectLake);
4146

4247
// Update ExtractionDate
@@ -242,7 +247,7 @@ private static string EmbedNameInString(string originalName)
242247
{
243248
return originalName.Replace("\"", "\"\"");
244249
}
245-
private void LoadColumnStatistics(bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
250+
private void LoadColumnStatistics(bool analyzeDirectQuery, DirectLakeExtractionMode analyzeDirectLake, int columnBatchSize)
246251
{
247252
var allColumns =
248253
(from t in DaxModel.Tables
@@ -257,7 +262,7 @@ from c in t.Columns
257262
|| (analyzeDirectLake == DirectLakeExtractionMode.Full)
258263
)
259264
select c).ToList();
260-
var loopColumns = allColumns.SplitList(50); // no more than 9999
265+
var loopColumns = allColumns.SplitList(columnBatchSize); // no more than 9999
261266
foreach ( var columnSet in loopColumns ) {
262267
var idString = 0;
263268
var dax = "EVALUATE ";

src/Dax.Model.Extractor/TomExtractor.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ public static Dax.Metadata.Model GetDaxModel(Tom.Model model, string extractorAp
284284
return extractor.DaxModel;
285285
}
286286

287-
public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
287+
public static Dax.Metadata.Model GetDaxModel(string connectionString, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize)
288288
{
289289
Tom.Server server = new Tom.Server();
290290
server.Connect(connectionString);
@@ -304,7 +304,7 @@ public static Dax.Metadata.Model GetDaxModel(string connectionString, string app
304304
if (readStatisticsFromData)
305305
{
306306
#pragma warning disable CS0618 // Type or member is obsolete
307-
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake);
307+
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize);
308308
#pragma warning restore CS0618 // Type or member is obsolete
309309

310310
// If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded.
@@ -334,7 +334,7 @@ public static Tom.Database GetDatabase(string connectionString)
334334
return db ?? throw new ArgumentException($"The database '{databaseName}' could not be found. Either it does not exist or you do not have admin rights to it.");
335335
}
336336

337-
public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly)
337+
public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseName, string applicationName, string applicationVersion, bool readStatisticsFromData = true, int sampleRows = 0, bool analyzeDirectQuery = false, DirectLakeExtractionMode analyzeDirectLake = DirectLakeExtractionMode.ResidentOnly, int statsColumnBatchSize = StatExtractor.DefaultColumnBatchSize)
338338
{
339339
Tom.Database db = GetDatabase(serverName, databaseName);
340340
Tom.Model tomModel = db.Model;
@@ -352,7 +352,7 @@ public static Dax.Metadata.Model GetDaxModel(string serverName, string databaseN
352352
if (readStatisticsFromData)
353353
{
354354
#pragma warning disable CS0618 // Type or member is obsolete
355-
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake);
355+
StatExtractor.UpdateStatisticsModel(daxModel, connection, sampleRows, analyzeDirectQuery, analyzeDirectLake, statsColumnBatchSize);
356356
#pragma warning restore CS0618 // Type or member is obsolete
357357

358358
// If model has any DL partitions and we have forced all columns into memory then re-run the DMVs to update the data with the new values after everything has been transcoded.

0 commit comments

Comments
 (0)