From 6ef869333d8dbf9f9080c86e8810d8d44772b470 Mon Sep 17 00:00:00 2001 From: Frank Liu Date: Thu, 29 Aug 2024 13:32:12 -0700 Subject: [PATCH] Fixes minor style issues --- .../java/ai/djl/serving/wlm/LmiUtils.java | 32 ++++---- .../serving/wlm/NeuronSmartDefaultUtils.java | 79 +++++++------------ .../wlm/NeuronSmartDefaultUtilsTest.java | 44 +++++------ 3 files changed, 63 insertions(+), 92 deletions(-) diff --git a/wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java b/wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java index 3800207102..c1b98cc6c0 100644 --- a/wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java +++ b/wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java @@ -493,7 +493,7 @@ static boolean isValidTrtLlmModelRepo(Path modelPath) throws IOException { // This represents the config of huggingface models NLP models as well // as the config of diffusers models. The config is different for both, but for // now we can leverage a single class since we don't need too much information from the config. - static final class HuggingFaceModelConfig { + public static final class HuggingFaceModelConfig { @SerializedName("model_type") private String modelType; @@ -550,7 +550,7 @@ public Set getArchitectures() { * max_position_embeddings and 4096. If both max_position_embeddings and 4096 are not * available, this function returns 0. * - * @return The default value for n_positions. + * @return the default value for n_positions */ public int getDefaultNPositions() { return Math.min(maxPositionEmbeddings, 4096); @@ -562,7 +562,7 @@ public int getDefaultNPositions() { * number of hidden layers, vocabulary size, and number of attention heads and key-value * heads to calculate the total parameter count. * - * @return The total parameter count for the model. 
+ * @return the total parameter count for the model */ private long getLlamaLikeParameterCount() { long headDim = (long) numAttentionHeads * numKeyValueHeads; @@ -588,13 +588,13 @@ private long getLlamaLikeParameterCount() { * of hidden layers, vocabulary size, and number of attention heads to calculate the total * parameter count. * - * @return The total parameter count for the model. + * @return the total parameter count for the model */ private long getDefaultParameterCount() { long embeddingLayerTotal = (long) (vocabSize + maxPositionEmbeddings) * hiddenSize; - long attentionTotal = (long) 4 * (hiddenSize * hiddenSize); - long feedForwardTotal = (long) 8 * (hiddenSize * hiddenSize); - long layerNormTotal = (long) 4 * hiddenSize; + long attentionTotal = 4L * hiddenSize * hiddenSize; + long feedForwardTotal = 8L * hiddenSize * hiddenSize; + long layerNormTotal = 4L * hiddenSize; long transformerBlockTotal = (attentionTotal + feedForwardTotal + layerNormTotal) * numHiddenLayers; long finalLayerTotal = (long) hiddenSize * vocabSize; @@ -604,17 +604,13 @@ private long getDefaultParameterCount() { /** * Calculates the total parameter count for the model. * - * @return The total parameter count for the model. + * @return the total parameter count for the model */ public long getModelParameters() { - try { - if ("llama".equals(modelType) || "mistral".equals(modelType)) { - return getLlamaLikeParameterCount(); - } - return getDefaultParameterCount(); - } catch (Exception e) { - return 0L; + if ("llama".equals(modelType) || "mistral".equals(modelType)) { + return getLlamaLikeParameterCount(); } + return getDefaultParameterCount(); } /** @@ -623,9 +619,9 @@ public long getModelParameters() { *

The memory required is calculated as the product of the sequence length, hidden size, * number of hidden layers, and weight in bytes. * - * @param sequenceLength The length in tokens of the sequence. - * @param weightBytes The weight in bytes. - * @return The memory required to store a single batch of sequence data. + * @param sequenceLength the length in tokens of the sequence + * @param weightBytes the weight in bytes + * @return the memory required to store a single batch of sequence data */ public long getApproxMemoryForSingleSequence(int sequenceLength, int weightBytes) { return (long) sequenceLength * hiddenSize * numHiddenLayers * weightBytes; diff --git a/wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java b/wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java index 8f19e2bb54..350853278a 100644 --- a/wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java +++ b/wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java @@ -12,6 +12,7 @@ */ package ai.djl.serving.wlm; +import ai.djl.serving.wlm.LmiUtils.HuggingFaceModelConfig; import ai.djl.util.NeuronUtils; import java.util.ArrayList; @@ -26,11 +27,9 @@ public class NeuronSmartDefaultUtils { private static final float MEMORY_PER_CORE = 16.0F; // Currently there is only one config w/ 16 gb per core - // Internal settings - private Integer nPositions; - private Integer availableCores; - private Float modelSizeInGb; - private Float sequenceSizeInGb; + private int availableCores; + private float modelSizeInGb; + private float sequenceSizeInGb; /** * Applies smart defaults for Neuron models. * *

  • option.max_rolling_batch_size: A heuristic based on available memory. * * - * @param prop The properties to update. - * @param modelConfig The model configuration to use. + * @param prop the properties to update + * @param modelConfig the model configuration to use */ - public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) { + public void applySmartDefaults(Properties prop, HuggingFaceModelConfig modelConfig) { if (!prop.containsKey("option.n_positions")) { prop.setProperty( "option.n_positions", String.valueOf(modelConfig.getDefaultNPositions())); @@ -62,9 +61,9 @@ public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig * @param prop The properties to retrieve settings from. * @param modelConfig The model configuration to use for calculations. */ - private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) { - clearInternalSettings(); - nPositions = Integer.parseInt(prop.getProperty("option.n_positions")); + private void setInternalSettings(Properties prop, HuggingFaceModelConfig modelConfig) { + // Internal settings + int nPositions = Integer.parseInt(prop.getProperty("option.n_positions", "0")); if (NeuronUtils.hasNeuron()) { availableCores = NeuronUtils.getNeuronCores(); } else { @@ -85,32 +84,13 @@ private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfi * degree. The adjustment is based on the estimated memory increase due to the tensor parallel * degree. * - * @param tpDegree The tensor parallel degree. - * @return The adjusted model size in GB. + * @param tpDegree the tensor parallel degree + * @return the adjusted model size in GB */ private float getAdjustedModelSizeInGb(int tpDegree) { return modelSizeInGb * (1.0F + ((tpDegree * 2 - 2) / 100.0F)); } - /** - * Clears the internal settings for this NeuronSmartDefaultUtils instance. - * - *

    This method clears the following fields: - * - *

      - *
    • {@link #nPositions} - *
    • {@link #availableCores} - *
    • {@link #modelSizeInGb} - *
    • {@link #sequenceSizeInGb} - *
    - */ - private void clearInternalSettings() { - nPositions = null; - availableCores = null; - modelSizeInGb = null; - sequenceSizeInGb = null; - } - /** * Sets a heuristic value for tensor parallel degree if not already set in model properties. * @@ -173,8 +153,8 @@ private void setHeuristicNeuronTPDegree(Properties prop) { /** * Finds the largest power of 2 less than or equal to n. * - * @param n The input number. - * @return The largest power of 2 less than or equal to n. + * @param n the input number + * @return the largest power of 2 less than or equal to n */ private int getMaxPowerOf2(int n) { if (n != 0 && (n & (n - 1)) == 0) { @@ -194,8 +174,8 @@ private int getMaxPowerOf2(int n) { *

    The maximum number of concurrent requests is calculated as the largest power of 2 less * than or equal to the total memory divided by the sequence size. * - * @param totalMemory The total memory available for the model. - * @return The maximum number of concurrent requests. + * @param totalMemory the total memory available for the model + * @return the maximum number of concurrent requests */ private int getMaxConcurrency(float totalMemory, int tpDegree) { int maxConcurrency = @@ -208,9 +188,9 @@ private int getMaxConcurrency(float totalMemory, int tpDegree) { * Calculates the maximum number of concurrent requests that can be served by a model given the * total memory available for the model and the sequence size. * - * @param totalMemory The total memory available for the model. - * @param batchSize The maximum number of requests that can be processed in a single batch. - * @return The maximum number of concurrent requests that can be served. + * @param totalMemory the total memory available for the model + * @param batchSize the maximum number of requests that can be processed in a single batch + * @return the maximum number of concurrent requests that can be served */ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) { int maxConcurrency = @@ -219,9 +199,8 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batc maxConcurrency = Math.min(maxConcurrency, batchSize); if (maxConcurrency == batchSize) { return maxConcurrency; - } else { - return 0; } + return 0; } /** @@ -231,7 +210,7 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batc * number of cores. This method returns a list of available core configurations for the given * number of cores. * - * @return The list of available core configurations. 
+ * @return the list of available core configurations */ private List availableCoreConfigs() { List coreConfigs = new ArrayList<>(); @@ -252,8 +231,8 @@ private List availableCoreConfigs() { * number of cores. This method returns a list of available core configurations for the given * number of cores. * - * @param nCores The number of cores to build the configurations for. - * @return The list of available core configurations. + * @param nCores the number of cores to build the configurations for + * @return the list of available core configurations */ private List buildCoreConfigs(int nCores) { List coreConfigs = new ArrayList<>(); @@ -279,14 +258,10 @@ private List buildCoreConfigs(int nCores) { * @param prop The properties to set the max rolling batch size to. */ private void setHeuristicNeuronMaxRollingBatch(Properties prop) { - int tpDegree; - try { - tpDegree = Integer.parseInt(prop.getProperty("option.tensor_parallel_degree")); - } catch (Exception e) { - // if tensor parallel degree exists and is not an integer, it is max, use all available - // cores - tpDegree = availableCores; - } + int tpDegree = + Integer.parseInt( + prop.getProperty( + "option.tensor_parallel_degree", String.valueOf(availableCores))); if (!prop.containsKey("option.max_rolling_batch_size")) { int maxRollingBatchSize = getMaxConcurrency(tpDegree * MEMORY_PER_CORE, tpDegree); if (maxRollingBatchSize > 0) { diff --git a/wlm/src/test/java/ai/djl/serving/wlm/NeuronSmartDefaultUtilsTest.java b/wlm/src/test/java/ai/djl/serving/wlm/NeuronSmartDefaultUtilsTest.java index 740b44c031..1d73186b01 100644 --- a/wlm/src/test/java/ai/djl/serving/wlm/NeuronSmartDefaultUtilsTest.java +++ b/wlm/src/test/java/ai/djl/serving/wlm/NeuronSmartDefaultUtilsTest.java @@ -12,6 +12,7 @@ */ package ai.djl.serving.wlm; +import ai.djl.serving.wlm.LmiUtils.HuggingFaceModelConfig; import ai.djl.util.JsonUtils; import ai.djl.util.NeuronUtils; @@ -61,13 +62,13 @@ public void testApplySmartDefaults70BModel() throws 
IOException { Properties prop = new Properties(); LmiUtils.HuggingFaceModelConfig modelConfig = get70BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } Assert.assertEquals(prop.getProperty("option.n_positions"), "4096"); Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1"); - Assert.assertEquals(prop.containsKey("option.max_rolling_batch_size"), false); + Assert.assertFalse(prop.containsKey("option.max_rolling_batch_size")); } @Test @@ -76,7 +77,7 @@ public void testApplySmartDefaultsQuantize8BModel() throws IOException { prop.setProperty("option.quantize", "static_int8"); LmiUtils.HuggingFaceModelConfig modelConfig = get8BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -90,7 +91,7 @@ public void testApplySmartDefaults2BModel() throws IOException { Properties prop = new Properties(); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -105,7 +106,7 @@ public void testApplySmartDefaultsQuantize2BModel() throws IOException { 
prop.setProperty("option.quantize", "static_int8"); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -120,7 +121,7 @@ public void testApplySmartDefaultsWithNPositions() throws IOException { prop.setProperty("option.n_positions", "128"); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -134,7 +135,7 @@ public void testApplySmartDefaultsWithTPDegree() throws IOException { prop.setProperty("option.tensor_parallel_degree", "1"); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -148,7 +149,7 @@ public void testApplySmartDefaultsWithMaxRollingBatch() throws IOException { prop.setProperty("option.max_rolling_batch_size", "64"); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + 
mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -162,7 +163,7 @@ public void testApplySmartDefaultsWithTPMax() throws IOException { prop.setProperty("option.tensor_parallel_degree", "max"); LmiUtils.HuggingFaceModelConfig modelConfig = get2BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(false); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(false); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -176,8 +177,8 @@ public void testApplySmartDefaultsWithNeuron() throws IOException { Properties prop = new Properties(); LmiUtils.HuggingFaceModelConfig modelConfig = get70BLlamaHuggingFaceModelConfig(); try (MockedStatic mockedStatic = Mockito.mockStatic(NeuronUtils.class)) { - mockedStatic.when(() -> NeuronUtils.hasNeuron()).thenReturn(true); - mockedStatic.when(() -> NeuronUtils.getNeuronCores()).thenReturn(32); + mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(true); + mockedStatic.when(NeuronUtils::getNeuronCores).thenReturn(32); NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils(); smartDefaultUtils.applySmartDefaults(prop, modelConfig); } @@ -187,44 +188,43 @@ public void testApplySmartDefaultsWithNeuron() throws IOException { } // Helper methods - public LmiUtils.HuggingFaceModelConfig get2BLlamaHuggingFaceModelConfig() throws IOException { + public HuggingFaceModelConfig get2BLlamaHuggingFaceModelConfig() throws IOException { try (Reader reader = Files.newBufferedReader( Paths.get("src/test/resources/smart-default-model/2b/config.json"))) { - return JsonUtils.GSON.fromJson(reader, LmiUtils.HuggingFaceModelConfig.class); + return JsonUtils.GSON.fromJson(reader, 
HuggingFaceModelConfig.class); } } - public LmiUtils.HuggingFaceModelConfig get8BLlamaHuggingFaceModelConfig() throws IOException { + public HuggingFaceModelConfig get8BLlamaHuggingFaceModelConfig() throws IOException { try (Reader reader = Files.newBufferedReader( Paths.get("src/test/resources/smart-default-model/8b/config.json"))) { - return JsonUtils.GSON.fromJson(reader, LmiUtils.HuggingFaceModelConfig.class); + return JsonUtils.GSON.fromJson(reader, HuggingFaceModelConfig.class); } } - public LmiUtils.HuggingFaceModelConfig get70BLlamaHuggingFaceModelConfig() throws IOException { + public HuggingFaceModelConfig get70BLlamaHuggingFaceModelConfig() throws IOException { try (Reader reader = Files.newBufferedReader( Paths.get("src/test/resources/smart-default-model/70b/config.json"))) { - return JsonUtils.GSON.fromJson(reader, LmiUtils.HuggingFaceModelConfig.class); + return JsonUtils.GSON.fromJson(reader, HuggingFaceModelConfig.class); } } - public LmiUtils.HuggingFaceModelConfig getDefaultHuggingFaceModelConfig() throws IOException { + public HuggingFaceModelConfig getDefaultHuggingFaceModelConfig() throws IOException { try (Reader reader = Files.newBufferedReader( Paths.get("src/test/resources/smart-default-model/unit/config.json"))) { - return JsonUtils.GSON.fromJson(reader, LmiUtils.HuggingFaceModelConfig.class); + return JsonUtils.GSON.fromJson(reader, HuggingFaceModelConfig.class); } } - public LmiUtils.HuggingFaceModelConfig getNoParametersHuggingFaceModelConfig() - throws IOException { + public HuggingFaceModelConfig getNoParametersHuggingFaceModelConfig() throws IOException { try (Reader reader = Files.newBufferedReader( Paths.get("src/test/resources/smart-default-model/empty/config.json"))) { - return JsonUtils.GSON.fromJson(reader, LmiUtils.HuggingFaceModelConfig.class); + return JsonUtils.GSON.fromJson(reader, HuggingFaceModelConfig.class); } } }