Commit c00228b: Fixes minor style issues

frankfliu committed Aug 29, 2024 (parent: 7eea99f)
Showing 3 changed files with 63 additions and 92 deletions.
32 changes: 14 additions & 18 deletions wlm/src/main/java/ai/djl/serving/wlm/LmiUtils.java
@@ -493,7 +493,7 @@ static boolean isValidTrtLlmModelRepo(Path modelPath) throws IOException {
// This represents the config of huggingface models NLP models as well
// as the config of diffusers models. The config is different for both, but for
// now we can leverage a single class since we don't need too much information from the config.
- static final class HuggingFaceModelConfig {
+ public static final class HuggingFaceModelConfig {

@SerializedName("model_type")
private String modelType;
@@ -550,7 +550,7 @@ public Set<String> getArchitectures() {
* max_position_embeddings and 4096. If both max_position_embeddings and 4096 are not
* available, this function returns 0.
*
- * @return The default value for n_positions.
+ * @return the default value for n_positions
*/
public int getDefaultNPositions() {
return Math.min(maxPositionEmbeddings, 4096);
@@ -562,7 +562,7 @@ public int getDefaultNPositions() {
* number of hidden layers, vocabulary size, and number of attention heads and key-value
* heads to calculate the total parameter count.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
private long getLlamaLikeParameterCount() {
long headDim = (long) numAttentionHeads * numKeyValueHeads;
@@ -588,13 +588,13 @@ private long getLlamaLikeParameterCount() {
* of hidden layers, vocabulary size, and number of attention heads to calculate the total
* parameter count.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
private long getDefaultParameterCount() {
long embeddingLayerTotal = (long) (vocabSize + maxPositionEmbeddings) * hiddenSize;
- long attentionTotal = (long) 4 * (hiddenSize * hiddenSize);
- long feedForwardTotal = (long) 8 * (hiddenSize * hiddenSize);
- long layerNormTotal = (long) 4 * hiddenSize;
+ long attentionTotal = 4L * hiddenSize * hiddenSize;
+ long feedForwardTotal = 8L * hiddenSize * hiddenSize;
+ long layerNormTotal = 4L * hiddenSize;
long transformerBlockTotal =
(attentionTotal + feedForwardTotal + layerNormTotal) * numHiddenLayers;
long finalLayerTotal = (long) hiddenSize * vocabSize;
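
Aside on the arithmetic fix above: in the removed lines, the cast in `(long) 4 * (hiddenSize * hiddenSize)` applies only to the literal `4`, so the parenthesized `hiddenSize * hiddenSize` is still evaluated in `int` arithmetic and can overflow before the widening happens; `4L * hiddenSize * hiddenSize` evaluates left to right entirely in `long`. A minimal standalone sketch (the `hiddenSize` value is hypothetical, chosen to force the overflow):

```java
public class OverflowDemo {
    public static void main(String[] args) {
        int hiddenSize = 65536; // hypothetical; 65536 * 65536 exceeds Integer.MAX_VALUE

        // Old form: (hiddenSize * hiddenSize) wraps around to 0 in int before widening.
        long broken = (long) 4 * (hiddenSize * hiddenSize);

        // New form: left-to-right long arithmetic, no intermediate int product.
        long fixed = 4L * hiddenSize * hiddenSize;

        System.out.println(broken); // 0
        System.out.println(fixed);  // 17179869184
    }
}
```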
@@ -604,17 +604,13 @@ private long getDefaultParameterCount() {
/**
* Calculates the total parameter count for the model.
*
- * @return The total parameter count for the model.
+ * @return the total parameter count for the model
*/
public long getModelParameters() {
- try {
- if ("llama".equals(modelType) || "mistral".equals(modelType)) {
- return getLlamaLikeParameterCount();
- }
- return getDefaultParameterCount();
- } catch (Exception e) {
- return 0L;
- }
+ if ("llama".equals(modelType) || "mistral".equals(modelType)) {
+ return getLlamaLikeParameterCount();
+ }
+ return getDefaultParameterCount();
}

/**
@@ -623,9 +619,9 @@ public long getModelParameters() {
* <p>The memory required is calculated as the product of the sequence length, hidden size,
* number of hidden layers, and weight in bytes.
*
- * @param sequenceLength The length in tokens of the sequence.
- * @param weightBytes The weight in bytes.
- * @return The memory required to store a single batch of sequence data.
+ * @param sequenceLength the length in tokens of the sequence
+ * @param weightBytes the weight in bytes
+ * @return the memory required to store a single batch of sequence data
*/
public long getApproxMemoryForSingleSequence(int sequenceLength, int weightBytes) {
return (long) sequenceLength * hiddenSize * numHiddenLayers * weightBytes;
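For intuition, a worked example of getApproxMemoryForSingleSequence with hypothetical values: a 4096-token sequence on a model with hiddenSize = 4096 and numHiddenLayers = 32, stored at 2 bytes per value (fp16/bf16), needs 4096 * 4096 * 32 * 2 bytes = 2^30 bytes = 1 GiB per sequence.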
79 changes: 27 additions & 52 deletions wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java
@@ -12,6 +12,7 @@
*/
package ai.djl.serving.wlm;

+ import ai.djl.serving.wlm.LmiUtils.HuggingFaceModelConfig;
import ai.djl.util.NeuronUtils;

import java.util.ArrayList;
@@ -26,11 +27,9 @@ public class NeuronSmartDefaultUtils {
private static final float MEMORY_PER_CORE =
16.0F; // Currently there is only one config w/ 16 gb per core

- // Internal settings
- private Integer nPositions;
- private Integer availableCores;
- private Float modelSizeInGb;
- private Float sequenceSizeInGb;
+ private int availableCores;
+ private float modelSizeInGb;
+ private float sequenceSizeInGb;

/**
* Applies smart defaults for Neuron models.
Expand All @@ -43,10 +42,10 @@ public class NeuronSmartDefaultUtils {
* <li>option.max_rolling_batch_size: A heuristic based on available memory.
* </ul>
*
- * @param prop The properties to update.
- * @param modelConfig The model configuration to use.
+ * @param prop the properties to update
+ * @param modelConfig the model configuration to use
*/
- public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
+ public void applySmartDefaults(Properties prop, HuggingFaceModelConfig modelConfig) {
if (!prop.containsKey("option.n_positions")) {
prop.setProperty(
"option.n_positions", String.valueOf(modelConfig.getDefaultNPositions()));
@@ -62,9 +61,9 @@ public void applySmartDefaults(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
* @param prop The properties to retrieve settings from.
* @param modelConfig The model configuration to use for calculations.
*/
- private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
- clearInternalSettings();
- nPositions = Integer.parseInt(prop.getProperty("option.n_positions"));
+ private void setInternalSettings(Properties prop, HuggingFaceModelConfig modelConfig) {
+ // Internal settings
+ int nPositions = Integer.parseInt(prop.getProperty("option.n_positions", "0"));
if (NeuronUtils.hasNeuron()) {
availableCores = NeuronUtils.getNeuronCores();
} else {
@@ -85,32 +84,13 @@ private void setInternalSettings(Properties prop, LmiUtils.HuggingFaceModelConfig modelConfig) {
* degree. The adjustment is based on the estimated memory increase due to the tensor parallel
* degree.
*
- * @param tpDegree The tensor parallel degree.
- * @return The adjusted model size in GB.
+ * @param tpDegree the tensor parallel degree
+ * @return the adjusted model size in GB
*/
private float getAdjustedModelSizeInGb(int tpDegree) {
return modelSizeInGb * (1.0F + ((tpDegree * 2 - 2) / 100.0F));
}
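
As a quick check of the formula with hypothetical degrees: tpDegree = 2 gives modelSizeInGb * (1 + (2 * 2 - 2) / 100), a 2% overhead, and tpDegree = 8 gives modelSizeInGb * (1 + (8 * 2 - 2) / 100), a 14% overhead, so the estimated footprint grows mildly with the sharding degree.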

- /**
- * Clears the internal settings for this NeuronSmartDefaultUtils instance.
- *
- * <p>This method clears the following fields:
- *
- * <ul>
- * <li>{@link #nPositions}
- * <li>{@link #availableCores}
- * <li>{@link #modelSizeInGb}
- * <li>{@link #sequenceSizeInGb}
- * </ul>
- */
- private void clearInternalSettings() {
- nPositions = null;
- availableCores = null;
- modelSizeInGb = null;
- sequenceSizeInGb = null;
- }
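
Note how this deletion follows from the field changes above: with availableCores, modelSizeInGb, and sequenceSizeInGb now primitives and nPositions demoted to a local variable in setInternalSettings, there is no null "unset" state left to reset, so clearInternalSettings had nothing left to do.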

/**
* Sets a heuristic value for tensor parallel degree if not already set in model properties.
*
@@ -173,8 +153,8 @@ private void setHeuristicNeuronTPDegree(Properties prop) {
/**
* Finds the largest power of 2 less than or equal to n.
*
- * @param n The input number.
- * @return The largest power of 2 less than or equal to n.
+ * @param n the input number
+ * @return the largest power of 2 less than or equal to n
*/
private int getMaxPowerOf2(int n) {
if (n != 0 && (n & (n - 1)) == 0) {
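
The check shown above, (n & (n - 1)) == 0, is the standard bitwise test for an exact power of two: subtracting 1 flips the lowest set bit and everything below it, so the AND is zero only when a single bit was set. The rest of the method body falls outside this hunk; below is a minimal sketch of the same "largest power of 2 <= n" computation using the JDK's Integer.highestOneBit, as an assumption about equivalent behavior rather than the file's actual implementation:

```java
// Returns the largest power of two <= n for positive n (sketch, not the DJL source).
static int maxPowerOf2(int n) {
    if (n != 0 && (n & (n - 1)) == 0) {
        return n; // n is already a power of two
    }
    return Integer.highestOneBit(n); // keeps only the top set bit, e.g. 12 -> 8
}
```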
@@ -194,8 +174,8 @@ private int getMaxPowerOf2(int n) {
* <p>The maximum number of concurrent requests is calculated as the largest power of 2 less
* than or equal to the total memory divided by the sequence size.
*
- * @param totalMemory The total memory available for the model.
- * @return The maximum number of concurrent requests.
+ * @param totalMemory the total memory available for the model
+ * @return the maximum number of concurrent requests
*/
private int getMaxConcurrency(float totalMemory, int tpDegree) {
int maxConcurrency =
@@ -208,9 +188,9 @@ private int getMaxConcurrency(float totalMemory, int tpDegree) {
* Calculates the maximum number of concurrent requests that can be served by a model given the
* total memory available for the model and the sequence size.
*
- * @param totalMemory The total memory available for the model.
- * @param batchSize The maximum number of requests that can be processed in a single batch.
- * @return The maximum number of concurrent requests that can be served.
+ * @param totalMemory the total memory available for the model
+ * @param batchSize the maximum number of requests that can be processed in a single batch
+ * @return the maximum number of concurrent requests that can be served
*/
private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
int maxConcurrency =
@@ -219,9 +199,8 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
maxConcurrency = Math.min(maxConcurrency, batchSize);
if (maxConcurrency == batchSize) {
return maxConcurrency;
- } else {
- return 0;
- }
+ return 0;
}

/**
@@ -231,7 +210,7 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
* number of cores. This method returns a list of available core configurations for the given
* number of cores.
*
- * @return The list of available core configurations.
+ * @return the list of available core configurations
*/
private List<Integer> availableCoreConfigs() {
List<Integer> coreConfigs = new ArrayList<>();
@@ -252,8 +231,8 @@ private List<Integer> availableCoreConfigs() {
* number of cores. This method returns a list of available core configurations for the given
* number of cores.
*
- * @param nCores The number of cores to build the configurations for.
- * @return The list of available core configurations.
+ * @param nCores the number of cores to build the configurations for
+ * @return the list of available core configurations
*/
private List<Integer> buildCoreConfigs(int nCores) {
List<Integer> coreConfigs = new ArrayList<>();
@@ -279,14 +258,10 @@ private List<Integer> buildCoreConfigs(int nCores) {
* @param prop The properties to set the max rolling batch size to.
*/
private void setHeuristicNeuronMaxRollingBatch(Properties prop) {
- int tpDegree;
- try {
- tpDegree = Integer.parseInt(prop.getProperty("option.tensor_parallel_degree"));
- } catch (Exception e) {
- // if tensor parallel degree exists and is not an integer, it is max, use all available
- // cores
- tpDegree = availableCores;
- }
+ int tpDegree =
+ Integer.parseInt(
+ prop.getProperty(
+ "option.tensor_parallel_degree", String.valueOf(availableCores)));
if (!prop.containsKey("option.max_rolling_batch_size")) {
int maxRollingBatchSize = getMaxConcurrency(tpDegree * MEMORY_PER_CORE, tpDegree);
if (maxRollingBatchSize > 0) {
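The rewritten parsing above leans on the two-argument java.util.Properties.getProperty(key, defaultValue), which returns the default only when the key is absent. A minimal sketch of that behavior (property names from the diff, core count hypothetical):

```java
import java.util.Properties;

public class PropertyDefaultDemo {
    public static void main(String[] args) {
        Properties prop = new Properties();
        int availableCores = 8; // hypothetical

        // Key absent: getProperty hands parseInt the default, "8".
        int tpDegree =
                Integer.parseInt(
                        prop.getProperty(
                                "option.tensor_parallel_degree", String.valueOf(availableCores)));
        System.out.println(tpDegree); // 8

        // Key present: the stored value wins over the default.
        prop.setProperty("option.tensor_parallel_degree", "4");
        tpDegree = Integer.parseInt(prop.getProperty("option.tensor_parallel_degree", "8"));
        System.out.println(tpDegree); // 4
    }
}
```

Note that, unlike the removed try/catch, parseInt still throws if the property is present but non-numeric; the default only covers the missing-key case.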
