Commit c00228b: Fixes minor style issues

frankfliu committed Aug 29, 2024 (parent: 7eea99f)
Showing 3 changed files with 63 additions and 92 deletions.
32 changes: 14 additions & 18 deletions wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java
@@ -493,7 +493,7 @@ static boolean isValidTrtLlmModelRepo(Path modelPath) throws IOException {
// This represents the config of huggingface models NLP models as well
// as the config of diffusers models. The config is different for both, but for
// now we can leverage a single class since we don't need too much information from the config.
- static final class HuggingFaceModelConfig {
+ public static final class HuggingFaceModelConfig {

@SerializedName("model_type")
private String modelType;
@@ -550,7 +550,7 @@ public Set<String> getArchitectures() {
* max_position_embeddings and 4096. If both max_position_embeddings and 4096 are not
* available, this function returns 0.
*
- * @return The default value for n_positions.
+ * @return the default value for n_positions
*/
public int getDefaultNPositions() {
return Math.min(maxPositionEmbeddings, 4096);
@@ -562,7 +562,7 @@ public int getDefaultNPositions() {
* number of hidden layers, vocabulary size, and number of attention heads and key-value
* heads to calculate the total parameter count.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
private long getLlamaLikeParameterCount() {
long headDim = (long) numAttentionHeads * numKeyValueHeads;
@@ -588,13 +588,13 @@ private long getLlamaLikeParameterCount() {
* of hidden layers, vocabulary size, and number of attention heads to calculate the total
* parameter count.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
private long getDefaultParameterCount() {
long embeddingLayerTotal = (long) (vocabSize + maxPositionEmbeddings) * hiddenSize;
- long attentionTotal = (long) 4 * (hiddenSize * hiddenSize);
- long feedForwardTotal = (long) 8 * (hiddenSize * hiddenSize);
- long layerNormTotal = (long) 4 * hiddenSize;
+ long attentionTotal = 4L * hiddenSize * hiddenSize;
+ long feedForwardTotal = 8L * hiddenSize * hiddenSize;
+ long layerNormTotal = 4L * hiddenSize;
long transformerBlockTotal =
(attentionTotal + feedForwardTotal + layerNormTotal) * numHiddenLayers;
long finalLayerTotal = (long) hiddenSize * vocabSize;
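
Aside on the arithmetic fix above: in the removed lines, the cast in `(long) 4 * (hiddenSize * hiddenSize)` applies only to the literal `4`, so the parenthesized `hiddenSize * hiddenSize` is still evaluated in `int` arithmetic and can overflow before the widening happens; `4L * hiddenSize * hiddenSize` evaluates left to right entirely in `long`. A minimal standalone sketch (the `hiddenSize` value is hypothetical, chosen to force the overflow):

```java
public class OverflowDemo {
    public static void main(String[] args) {
        int hiddenSize = 65536; // hypothetical; 65536 * 65536 exceeds Integer.MAX_VALUE

        // Old form: (hiddenSize * hiddenSize) wraps around to 0 in int before widening.
        long broken = (long) 4 * (hiddenSize * hiddenSize);

        // New form: left-to-right long arithmetic, no intermediate int product.
        long fixed = 4L * hiddenSize * hiddenSize;

        System.out.println(broken); // 0
        System.out.println(fixed);  // 17179869184
    }
}
```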
@@ -604,17 +604,13 @@ private long getDefaultParameterCount() {
/**
* Calculates the total parameter count for the model.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
public long getModelParameters() {
- try {
- if ("llama".equals(modelType) || "mistral".equals(modelType)) {
- return getLlamaLikeParameterCount();
- }
- return getDefaultParameterCount();
- } catch (Exception e) {
- return 0L;
- }
+ if ("llama".equals(modelType) || "mistral".equals(modelType)) {
+ return getLlamaLikeParameterCount();
+ }
+ return getDefaultParameterCount();
}

/**
@@ -623,9 +619,9 @@ public long getModelParameters() {
* <p>The memory required is calculated as the product of the sequence length, hidden size,
* number of hidden layers, and weight in bytes.
*
- * @param sequenceLength The length in tokens of the sequence.
- * @param weightBytes The weight in bytes.
- * @return The memory required to store a single batch of sequence data.
+ * @param sequenceLength the length in tokens of the sequence
+ * @param weightBytes the weight in bytes
+ * @return the memory required to store a single batch of sequence data
*/
public long getApproxMemoryForSingleSequence(int sequenceLength, int weightBytes) {
return (long) sequenceLength * hiddenSize * numHiddenLayers * weightBytes;
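For intuition, a worked example of getApproxMemoryForSingleSequence with hypothetical values: a 4096-token sequence on a model with hiddenSize = 4096 and numHiddenLayers = 32, stored at 2 bytes per value (fp16/bf16), needs 4096 * 4096 * 32 * 2 bytes = 2^30 bytes = 1 GiB per sequence.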
79 changes: 27 additions & 52 deletions wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java
@@ -12,6 +12,7 @@
*/
package ai.djl.serving.wlm;

+ import ai.djl.serving.wlm.LmiUtils.HuggingFaceModelConfig;
import ai.djl.util.NeuronUtils;

import java.util.ArrayList;
@@ -26,11 +27,9 @@ public class NeuronSmartDefaultUtils {
private static final float MEMORY_PER_CORE =
16.0F; // Currently there is only one config w/ 16 gb per core

- // Internal settings
- private Integer nPositions;
- private Integer availableCores;
- private Float modelSizeInGb;
- private Float sequenceSizeInGb;
+ private int availableCores;
+ private float modelSizeInGb;
+ private float sequenceSizeInGb;

/**
* Applies smart defaults for Neuron models.
Expand All @@ -43,10 +42,10 @@ public class NeuronSmartDefaultUtils {
* <li>option.max_rolling_batch_size: A heuristic based on available memory.
* </ul>
*
- * @param prop The properties to update.
- * @param modelConfig The model configuration to use.
+ * @param prop the properties to update
+ * @param modelConfig the model configuration to use
*/
- public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
+ public void applySmartDefaults(Properties prop, HuggingFaceModelConfig modelConfig) {
if (!prop.containsKey("option.n_positions")) {
prop.setProperty(
"option.n_positions", String.valueOf(modelConfig.getDefaultNPositions()));
@@ -62,9 +61,9 @@ public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
* @param prop The properties to retrieve settings from.
* @param modelConfig The model configuration to use for calculations.
*/
- private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
- clearInternalSettings();
- nPositions = Integer.parseInt(prop.getProperty("option.n_positions"));
+ private void setInternalSettings(Properties prop, HuggingFaceModelConfig modelConfig) {
+ // Internal settings
+ int nPositions = Integer.parseInt(prop.getProperty("option.n_positions", "0"));
if (NeuronUtils.hasNeuron()) {
availableCores = NeuronUtils.getNeuronCores();
} else {
@@ -85,32 +84,13 @@ private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
* degree. The adjustment is based on the estimated memory increase due to the tensor parallel
* degree.
*
- * @param tpDegree The tensor parallel degree.
- * @return The adjusted model size in GB.
+ * @param tpDegree the tensor parallel degree
+ * @return the adjusted model size in GB
*/
private float getAdjustedModelSizeInGb(int tpDegree) {
return modelSizeInGb * (1.0F + ((tpDegree * 2 - 2) / 100.0F));
}
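
As a quick check of the formula with hypothetical degrees: tpDegree = 2 gives modelSizeInGb * (1 + (2 * 2 - 2) / 100), a 2% overhead, and tpDegree = 8 gives modelSizeInGb * (1 + (8 * 2 - 2) / 100), a 14% overhead, so the estimated footprint grows mildly with the sharding degree.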

- /**
- * Clears the internal settings for this NeuronSmartDefaultUtils instance.
- *
- * <p>This method clears the following fields:
- *
- * <ul>
- * <li>{@link #nPositions}
- * <li>{@link #availableCores}
- * <li>{@link #modelSizeInGb}
- * <li>{@link #sequenceSizeInGb}
- * </ul>
- */
- private void clearInternalSettings() {
- nPositions = null;
- availableCores = null;
- modelSizeInGb = null;
- sequenceSizeInGb = null;
- }
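
Note how this deletion follows from the field changes above: with availableCores, modelSizeInGb, and sequenceSizeInGb now primitives and nPositions demoted to a local variable in setInternalSettings, there is no null "unset" state left to reset, so clearInternalSettings had nothing left to do.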

/**
* Sets a heuristic value for tensor parallel degree if not already set in model properties.
*
@@ -173,8 +153,8 @@ private void setHeuristicNeuronTPDegree(Properties prop) {
/**
* Finds the largest power of 2 less than or equal to n.
*
- * @param n The input number.
- * @return The largest power of 2 less than or equal to n.
+ * @param n the input number
+ * @return the largest power of 2 less than or equal to n
*/
private int getMaxPowerOf2(int n) {
if (n != 0 && (n & (n - 1)) == 0) {
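
The check shown above, (n & (n - 1)) == 0, is the standard bitwise test for an exact power of two: subtracting 1 flips the lowest set bit and everything below it, so the AND is zero only when a single bit was set. The rest of the method body falls outside this hunk; below is a minimal sketch of the same "largest power of 2 <= n" computation using the JDK's Integer.highestOneBit, as an assumption about equivalent behavior rather than the file's actual implementation:

```java
// Returns the largest power of two <= n for positive n (sketch, not the DJL source).
static int maxPowerOf2(int n) {
    if (n != 0 && (n & (n - 1)) == 0) {
        return n; // n is already a power of two
    }
    return Integer.highestOneBit(n); // keeps only the top set bit, e.g. 12 -> 8
}
```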
@@ -194,8 +174,8 @@ private int getMaxPowerOf2(int n) {
* <p>The maximum number of concurrent requests is calculated as the largest power of 2 less
* than or equal to the total memory divided by the sequence size.
*
- * @param totalMemory The total memory available for the model.
- * @return The maximum number of concurrent requests.
+ * @param totalMemory the total memory available for the model
+ * @return the maximum number of concurrent requests
*/
private int getMaxConcurrency(float totalMemory, int tpDegree) {
int maxConcurrency =
@@ -208,9 +188,9 @@ private int getMaxConcurrency(float totalMemory, int tpDegree) {
* Calculates the maximum number of concurrent requests that can be served by a model given the
* total memory available for the model and the sequence size.
*
- * @param totalMemory The total memory available for the model.
- * @param batchSize The maximum number of requests that can be processed in a single batch.
- * @return The maximum number of concurrent requests that can be served.
+ * @param totalMemory the total memory available for the model
+ * @param batchSize the maximum number of requests that can be processed in a single batch
+ * @return the maximum number of concurrent requests that can be served
*/
private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
int maxConcurrency =
@@ -219,9 +199,8 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
maxConcurrency = Math.min(maxConcurrency, batchSize);
if (maxConcurrency == batchSize) {
return maxConcurrency;
- } else {
- return 0;
- }
+ return 0;
}

/**
@@ -231,7 +210,7 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
* number of cores. This method returns a list of available core configurations for the given
* number of cores.
*
- * @return The list of available core configurations.
+ * @return the list of available core configurations
*/
private List<Integer> availableCoreConfigs() {
List<Integer> coreConfigs = new ArrayList<>();
@@ -252,8 +231,8 @@ private List<Integer> availableCoreConfigs() {
* number of cores. This method returns a list of available core configurations for the given
* number of cores.
*
- * @param nCores The number of cores to build the configurations for.
- * @return The list of available core configurations.
+ * @param nCores the number of cores to build the configurations for
+ * @return the list of available core configurations
*/
private List<Integer> buildCoreConfigs(int nCores) {
List<Integer> coreConfigs = new ArrayList<>();
@@ -279,14 +258,10 @@ private List<Integer> buildCoreConfigs(int nCores) {
* @param prop The properties to set the max rolling batch size to.
*/
private void setHeuristicNeuronMaxRollingBatch(Properties prop) {
- int tpDegree;
- try {
- tpDegree = Integer.parseInt(prop.getProperty("option.tensor_parallel_degree"));
- } catch (Exception e) {
- // if tensor parallel degree exists and is not an integer, it is max, use all available
- // cores
- tpDegree = availableCores;
- }
+ int tpDegree =
+ Integer.parseInt(
+ prop.getProperty(
+ "option.tensor_parallel_degree", String.valueOf(availableCores)));
if (!prop.containsKey("option.max_rolling_batch_size")) {
int maxRollingBatchSize = getMaxConcurrency(tpDegree * MEMORY_PER_CORE, tpDegree);
if (maxRollingBatchSize > 0) {
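The rewritten parsing above leans on the two-argument java.util.Properties.getProperty(key, defaultValue), which returns the default only when the key is absent. A minimal sketch of that behavior (property names from the diff, core count hypothetical):

```java
import java.util.Properties;

public class PropertyDefaultDemo {
    public static void main(String[] args) {
        Properties prop = new Properties();
        int availableCores = 8; // hypothetical

        // Key absent: getProperty hands parseInt the default, "8".
        int tpDegree =
                Integer.parseInt(
                        prop.getProperty(
                                "option.tensor_parallel_degree", String.valueOf(availableCores)));
        System.out.println(tpDegree); // 8

        // Key present: the stored value wins over the default.
        prop.setProperty("option.tensor_parallel_degree", "4");
        tpDegree = Integer.parseInt(prop.getProperty("option.tensor_parallel_degree", "8"));
        System.out.println(tpDegree); // 4
    }
}
```

Note that, unlike the removed try/catch, parseInt still throws if the property is present but non-numeric; the default only covers the missing-key case.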
