[docker][neuron] Version bumps for vllm 0.6.0 (#2379)
tosterberg authored Sep 12, 2024
1 parent 1d6d763 · commit d0873f0
Showing 5 changed files with 69 additions and 24 deletions.
engines/python/setup/djl_python/neuron_utils/model_loader.py (9 changes: 4 additions & 5 deletions)
@@ -161,11 +161,10 @@ def can_use_continuous_batching(self) -> bool:
        :return: bool indicating if continuous batching can be used
        """
-        use_continuous_batching = (self.config.rolling_batch != "disable"
-                                   and self.config.rolling_batch_strategy
-                                   == TnXGenerationStrategy.continuous_batching
-                                   and self.config.max_rolling_batch_size
-                                   > 1) or self.config.rolling_batch == "vllm"
+        use_continuous_batching = (
+            self.config.rolling_batch != "disable"
+            and self.config.rolling_batch_strategy
+            == TnXGenerationStrategy.continuous_batching)
        return use_continuous_batching

    def set_neuron_config(self) -> None:
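
Note: the net effect of this hunk is that the helper no longer special-cases vllm or the batch size; only the transformers-neuronx continuous-batching strategy enables it. A minimal Python sketch of the new predicate follows; the SimpleNamespace stub and the CONTINUOUS_BATCHING constant are stand-ins for the real config object and TnXGenerationStrategy enum, which are not part of this diff.

    # Sketch of the revised predicate with a stubbed config object.
    from types import SimpleNamespace

    CONTINUOUS_BATCHING = "continuous_batching"  # stand-in enum value

    def can_use_continuous_batching(config) -> bool:
        # vllm and max_rolling_batch_size no longer influence the result
        return (config.rolling_batch != "disable"
                and config.rolling_batch_strategy == CONTINUOUS_BATCHING)

    config = SimpleNamespace(rolling_batch="auto",
                             rolling_batch_strategy=CONTINUOUS_BATCHING)
    print(can_use_continuous_batching(config))  # True
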
serving/docker/pytorch-inf2.Dockerfile (9 changes: 5 additions & 4 deletions)
@@ -20,13 +20,13 @@ ARG transformers_neuronx_version=0.11.351
ARG neuronx_distributed_version=0.8.0
ARG neuronx_cc_version=2.14.227.0
ARG protobuf_version=3.19.6
-ARG transformers_version=4.43.1
+ARG transformers_version=4.43.2
ARG accelerate_version=0.29.2
ARG diffusers_version=0.28.2
ARG pydantic_version=2.6.1
ARG optimum_neuron_version=0.0.24
# %2B is the url escape for the '+' character
-ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.5.0%2Bnightly-py3-none-any.whl"
+ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.6.0%2Bnightly-py3-none-any.whl"
EXPOSE 8080

# Sets up Path for Neuron tools
@@ -73,12 +73,13 @@ RUN mkdir -p /opt/djl/bin && cp scripts/telemetry.sh /opt/djl/bin && \
    scripts/install_djl_serving.sh $djl_version && \
    scripts/install_djl_serving.sh $djl_version ${torch_version} && \
    scripts/install_inferentia2.sh && \
-    pip install accelerate==${accelerate_version} safetensors ${vllm_wheel} torchvision==${torchvision_version} \
+    pip install accelerate==${accelerate_version} safetensors torchvision==${torchvision_version} \
    neuronx-cc==${neuronx_cc_version} torch-neuronx==${torch_neuronx_version} transformers-neuronx==${transformers_neuronx_version} \
    neuronx_distributed==${neuronx_distributed_version} protobuf==${protobuf_version} sentencepiece jinja2 \
    diffusers==${diffusers_version} opencv-contrib-python-headless Pillow --extra-index-url=https://pip.repos.neuron.amazonaws.com \
    pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile && \
-    pip install transformers==${transformers_version} && \
+    pip install transformers==${transformers_version} ${vllm_wheel} && \
+    echo y | pip uninstall triton && \
    scripts/install_s5cmd.sh x64 && \
    scripts/patch_oss_dlc.sh python && \
    useradd -m -d /home/djl djl && \
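
Note: two ordering details in this hunk. The vllm wheel now installs in the same pip invocation as the pinned transformers version, so the pin is resolved together with the wheel's own requirement rather than before it, and triton (pulled in transitively, unused on Inferentia) is uninstalled. A hypothetical post-build sanity check one might run inside the image, with the version string taken from the ARG above:

    # Confirms the transformers pin survived the vllm wheel install and
    # that triton is gone after the explicit uninstall.
    import importlib.util
    import transformers

    assert transformers.__version__ == "4.43.2"
    assert importlib.util.find_spec("triton") is None
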
@@ -109,7 +109,12 @@ private static void setRollingBatch(
            // Non text-generation use-cases are not compatible with rolling batch
            rollingBatch = "disable";
        } else if (isTnxEnabled(features)) {
-            rollingBatch = "tnx";
+            if (Integer.parseInt(lmiProperties.getProperty("option.max_rolling_batch_size", "1"))
+                    >= 12) {
+                rollingBatch = "vllm";
+            } else {
+                rollingBatch = "tnx";
+            }
        } else if (isLmiDistEnabled(features)
                && "lmi-dist".equals(MODEL_TO_ROLLING_BATCH.get(modelType))) {
            rollingBatch = "lmi-dist";
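
Note: with the 0.6.0 wheel available, the recommender now routes larger Neuron batches to the vllm engine and keeps smaller ones on transformers-neuronx. A Python sketch of the selection rule; the function name and dict-backed properties are illustrative, while the >= 12 threshold and the property key come from the diff.

    def pick_neuron_rolling_batch(lmi_properties: dict) -> str:
        # Batches of 12 or more go to vllm; smaller batches stay on tnx.
        max_batch = int(lmi_properties.get("option.max_rolling_batch_size", "1"))
        return "vllm" if max_batch >= 12 else "tnx"

    assert pick_neuron_rolling_batch({"option.max_rolling_batch_size": "16"}) == "vllm"
    assert pick_neuron_rolling_batch({}) == "tnx"
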
wlm/src/main/java/ai/djl/serving/wlm/NeuronSmartDefaultUtils.java (39 changes: 32 additions & 7 deletions)
@@ -15,6 +15,9 @@
import ai.djl.serving.wlm.LmiUtils.HuggingFaceModelConfig;
import ai.djl.util.NeuronUtils;

+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
@@ -23,14 +26,17 @@
public class NeuronSmartDefaultUtils {

    private static final float BILLION = 1_000_000_000.0F;
-    private static final int MAX_ROLLING_BATCH = 128; // Current cap for NeuronSDK 2.19.1
+    private static final int MAX_ROLLING_BATCH =
+            32; // Current best throughput and latency balance batch size
    private static final float MEMORY_PER_CORE =
            16.0F; // Currently there is only one config w/ 16 gb per core

    private int availableCores;
    private float modelSizeInGb;
    private float sequenceSizeInGb;

+    private static final Logger logger = LoggerFactory.getLogger(NeuronSmartDefaultUtils.class);
+
    /**
     * Applies smart defaults for Neuron models.
     *
@@ -53,6 +59,7 @@ public void applySmartDefaults(Properties prop, HuggingFaceModelConfig modelConfig) {
            }
            prop.setProperty(
                    "option.n_positions", String.valueOf(modelConfig.getDefaultNPositions()));
+            logger.info("[Smart Default] N_POSITIONS: {}.", prop.getProperty("option.n_positions"));
        }
        setInternalSettings(prop, modelConfig);
        setHeuristicNeuronTPDegree(prop);
@@ -77,6 +84,7 @@ private void setInternalSettings(Properties prop, HuggingFaceModelConfig modelConfig) {
        modelSizeInGb = (paramBytes * modelConfig.getModelParameters()) / BILLION;
        sequenceSizeInGb =
                modelConfig.getApproxMemoryForSingleSequence(nPositions, paramBytes)
+                        * 0.95F
                        / (1024.0F * 1024.0F * 1024.0F);
    }

@@ -119,6 +127,9 @@ private void setHeuristicNeuronTPDegree(Properties prop) {
        if (prop.containsKey("option.tensor_parallel_degree")
                && "max".equals(prop.getProperty("option.tensor_parallel_degree"))) {
            prop.setProperty("option.tensor_parallel_degree", String.valueOf(availableCores));
+            logger.info(
+                    "[Smart Default] TENSOR_PARALLEL_DEGREE:" + " {}.",
+                    prop.getProperty("option.tensor_parallel_degree"));
            return;
        }

@@ -130,27 +141,36 @@
            int totalInstanceConcurrency = getMaxConcurrency(totalMemory, tpDegree);
            for (int coreConfig : coreConfigs) {
                float maxMemory = coreConfig * MEMORY_PER_CORE;
-                int maxConcurrency = getMaxConcurrency(maxMemory, coreConfig);
+                int maxConcurrency =
+                        getMaxConcurrency(maxMemory, coreConfig) * (availableCores / coreConfig);
                if (maxConcurrency >= totalInstanceConcurrency && coreConfig <= tpDegree) {
                    tpDegree = coreConfig;
                    totalInstanceConcurrency = maxConcurrency;
                }
            }
            prop.setProperty("option.tensor_parallel_degree", String.valueOf(tpDegree));
+            logger.info(
+                    "[Smart Default] TENSOR_PARALLEL_DEGREE:" + " {}.",
+                    prop.getProperty("option.tensor_parallel_degree"));
        } else if (!prop.containsKey("option.tensor_parallel_degree")) {
            // Set tensor parallel degree by minimizing TP degree that supports fixed batch size
            int batchSize = Integer.parseInt(prop.getProperty("option.max_rolling_batch_size"));
            int totalInstanceConcurrency =
                    getMaxConcurrencyWithBatch(totalMemory, tpDegree, batchSize);
            for (int coreConfig : coreConfigs) {
                float maxMemory = coreConfig * MEMORY_PER_CORE;
-                int maxConcurrency = getMaxConcurrencyWithBatch(maxMemory, coreConfig, batchSize);
+                int maxConcurrency =
+                        getMaxConcurrencyWithBatch(maxMemory, coreConfig, batchSize)
+                                * (availableCores / coreConfig);
                if (maxConcurrency >= totalInstanceConcurrency && coreConfig <= tpDegree) {
                    tpDegree = coreConfig;
                    totalInstanceConcurrency = maxConcurrency;
                }
            }
            prop.setProperty("option.tensor_parallel_degree", String.valueOf(tpDegree));
+            logger.info(
+                    "[Smart Default] TENSOR_PARALLEL_DEGREE: {}.",
+                    prop.getProperty("option.tensor_parallel_degree"));
        }
    }
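
Note: the substantive change in both branches is the `* (availableCores / coreConfig)` factor. A candidate TP degree is now credited with the concurrency of every model copy that fits on the instance, not just one replica, so a smaller degree that allows more copies can win. A Python sketch under stated assumptions: get_max_concurrency is a stand-in for the Java helper of the same name, whose body is not part of this diff, and the integer replica factor mirrors Java's integer division.

    MEMORY_PER_CORE = 16.0  # GiB per core, as in the Java constant

    def pick_tp_degree(core_configs, available_cores, get_max_concurrency):
        tp_degree = available_cores
        best = get_max_concurrency(tp_degree * MEMORY_PER_CORE, tp_degree)
        for core_config in core_configs:
            max_memory = core_config * MEMORY_PER_CORE
            # New in this commit: count every model copy that fits on the
            # instance, not just a single replica.
            concurrency = (get_max_concurrency(max_memory, core_config)
                           * (available_cores // core_config))
            if concurrency >= best and core_config <= tp_degree:
                tp_degree = core_config
                best = concurrency
        return tp_degree

    # Toy concurrency model: 0.5 GiB per sequence after a 10 GiB model copy.
    toy = lambda memory, tp: max(int((memory - 10.0) / 0.5), 0)
    print(pick_tp_degree([2, 4, 8], 8, toy))  # 8 for this toy model

The tie-break still prefers the smaller core config (`coreConfig <= tpDegree` with `>=` on concurrency), so replicas only displace a larger TP degree when they match or beat its total throughput.
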

@@ -222,9 +242,9 @@ private int getMaxConcurrencyWithBatch(float totalMemory, int tpDegree, int batchSize) {
    private List<Integer> availableCoreConfigs() {
        List<Integer> coreConfigs = new ArrayList<>();
        List<Integer> availableCoreConfigs = buildCoreConfigs(availableCores);
-        int coresPerModel = (int) Math.ceil(modelSizeInGb / MEMORY_PER_CORE);
+        int coresPerModel = (int) Math.ceil(1.1F * modelSizeInGb / MEMORY_PER_CORE);
        for (int coreConfig : availableCoreConfigs) {
-            if (coresPerModel >= coreConfig) {
+            if (coresPerModel <= coreConfig) {
                coreConfigs.add(coreConfig);
            }
        }
@@ -250,8 +270,10 @@ private List<Integer> buildCoreConfigs(int nCores) {
                coreConfigs.add(i);
            }
        }
-        // Add the given number of cores to the list
-        coreConfigs.add(nCores);
+        // Add the given number of cores to the list if not already added
+        if (nCores > 8) {
+            coreConfigs.add(nCores);
+        }
        return coreConfigs;
    }
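
Note: these two hunks together fix the core-config filter: weights are budgeted with a 10% overhead factor, and the inverted comparison is corrected so a config qualifies when it supplies at least the cores the model needs (coresPerModel <= coreConfig). A Python sketch; the power-of-two list inside build_core_configs is an assumption about the part of the Java method not shown in this diff, while the 1.1 factor, the <= comparison, and the nCores > 8 guard come from the changed lines.

    import math

    MEMORY_PER_CORE = 16.0  # GiB per core

    def build_core_configs(n_cores):
        # Assumed: power-of-two groupings up to 8 that fit on the instance.
        configs = [i for i in (1, 2, 4, 8) if i <= n_cores]
        if n_cores > 8:  # new guard avoids duplicating 1/2/4/8
            configs.append(n_cores)
        return configs

    def available_core_configs(available_cores, model_size_gb):
        # 10% overhead on weights, then keep configs with enough cores.
        cores_per_model = math.ceil(1.1 * model_size_gb / MEMORY_PER_CORE)
        return [c for c in build_core_configs(available_cores)
                if cores_per_model <= c]

    print(available_core_configs(32, 16.0))  # [2, 4, 8, 32] for ~16 GiB weights
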

@@ -274,6 +296,9 @@ private void setHeuristicNeuronMaxRollingBatch(Properties prop) {
        if (maxRollingBatchSize > 0) {
            prop.setProperty(
                    "option.max_rolling_batch_size", String.valueOf(maxRollingBatchSize));
+            logger.info(
+                    "[Smart Default] MAX_ROLLING_BATCH_SIZE: {}.",
+                    prop.getProperty("option.max_rolling_batch_size"));
        }
    }
}
@@ -104,7 +104,7 @@ public void testApplySmartDefaultsQuantize8BModel() throws IOException {
        }
        Assert.assertEquals(prop.getProperty("option.n_positions"), "4096");
        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "8");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "16");
    }

    @Test
@@ -118,7 +118,7 @@ public void testApplySmartDefaults2BModel() throws IOException {
        }
        Assert.assertEquals(prop.getProperty("option.n_positions"), "2048");
        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "64");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "32");
    }

    @Test
@@ -133,7 +133,7 @@ public void testApplySmartDefaultsQuantize2BModel() throws IOException {
        }
        Assert.assertEquals(prop.getProperty("option.n_positions"), "2048");
        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "128");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "32");
    }

    @Test
@@ -147,7 +147,7 @@ public void testApplySmartDefaultsWithNPositions() throws IOException {
            smartDefaultUtils.applySmartDefaults(prop, modelConfig);
        }
        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "128");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "32");
    }

    @Test
@@ -161,7 +161,7 @@ public void testApplySmartDefaultsWithTPDegree() throws IOException {
            smartDefaultUtils.applySmartDefaults(prop, modelConfig);
        }
        Assert.assertEquals(prop.getProperty("option.n_positions"), "2048");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "64");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "32");
    }

    @Test
@@ -190,11 +190,26 @@ public void testApplySmartDefaultsWithTPMax() throws IOException {
        }
        Assert.assertEquals(prop.getProperty("option.n_positions"), "2048");
        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "1");
-        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "64");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "32");
    }

    @Test
+    public void testApplySmartDefaultsWithNeuron8bModel() throws IOException {
+        Properties prop = new Properties();
+        LmiUtils.HuggingFaceModelConfig modelConfig = get8BLlamaHuggingFaceModelConfig();
+        try (MockedStatic<NeuronUtils> mockedStatic = Mockito.mockStatic(NeuronUtils.class)) {
+            mockedStatic.when(NeuronUtils::hasNeuron).thenReturn(true);
+            mockedStatic.when(NeuronUtils::getNeuronCores).thenReturn(32);
+            NeuronSmartDefaultUtils smartDefaultUtils = new NeuronSmartDefaultUtils();
+            smartDefaultUtils.applySmartDefaults(prop, modelConfig);
+        }
+        Assert.assertEquals(prop.getProperty("option.n_positions"), "4096");
+        Assert.assertEquals(prop.getProperty("option.tensor_parallel_degree"), "2");
+        Assert.assertEquals(prop.getProperty("option.max_rolling_batch_size"), "16");
+    }
+
+    @Test
-    public void testApplySmartDefaultsWithNeuron() throws IOException {
+    public void testApplySmartDefaultsWithNeuron70bModel() throws IOException {
        Properties prop = new Properties();
        LmiUtils.HuggingFaceModelConfig modelConfig = get70BLlamaHuggingFaceModelConfig();
        try (MockedStatic<NeuronUtils> mockedStatic = Mockito.mockStatic(NeuronUtils.class)) {
