Skip to content

Commit

Permalink
simplify job cost and penalty calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
cmelone committed Oct 16, 2024
1 parent b3c5b7b commit fc688c4
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 21 deletions.
30 changes: 16 additions & 14 deletions gantry/clients/prometheus/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,19 +248,21 @@ async def get_costs(
cost_per_cpu = (node_cost * 0.5) / cores
# cost of each unit of memory (byte)
cost_per_mem = (node_cost * 0.5) / mem
# these utilization ratios determine the extent to which resources
# were misallocated for this job
cpu_util_ratio = usage["cpu_mean"] / resources["cpu_request"]
mem_util_ratio = usage["mem_mean"] / resources["mem_request"]

# compute separate costs for cpu and memory usage
# using requests instead of actual usage because requests are *guaranteed*
costs["cpu_cost"] = cost_per_cpu * resources["cpu_request"]
costs["mem_cost"] = cost_per_mem * resources["mem_request"]
# penalty factors meant to discourage underallocation, which slows down jobs
# or overallocation, which prevents jobs from being scheduled on the same node

costs["cpu_penalty"] = max(1 / cpu_util_ratio, cpu_util_ratio)
costs["mem_penalty"] = max(1 / mem_util_ratio, mem_util_ratio)

# base cost of a job is the resources it consumed (usage)
costs["cpu_cost"] = usage["cpu_mean"] * cost_per_cpu
costs["mem_cost"] = usage["mem_mean"] * cost_per_mem

# penalty factors are meant to capture misallocation, or the
# opportunity cost of the job's behavior on the cluster/node
# underallocation delays scheduling of other jobs, increasing pipeline duration
# overallocation interferes with the work of other jobs and crowds the node
# the penalty is the absolute difference between the job's usage and request
costs["cpu_penalty"] = (
abs(usage["cpu_mean"] - resources["cpu_request"]) * cost_per_cpu
)
costs["mem_penalty"] = (
abs(usage["mem_mean"] - resources["mem_request"]) * cost_per_mem
)

return costs
2 changes: 1 addition & 1 deletion gantry/tests/defs/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

# used to compare successful insertions
# run SELECT * FROM table_name WHERE id = 1; from python sqlite api and grab fetchone() result
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255)
INSERTED_JOB = (1, 'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z', 1, 1706117046, 1706118420, 9892514, 'success', 'pr42264_bugfix/mathomp4/hdf5-appleclang15', 'gmsh', '4.8.4', '{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}', 'gcc', '11.4.0', 'linux-ubuntu20.04-x86_64_v3', 'e4s', 16, 0.75, None, 1.899768349523097, 0.2971597591741076, 4.128116379389054, 0.2483743618267752, 1.7602635378120381, 2000000000.0, 48000000000.0, 143698407.6190476, 2785280.0, 594620416.0, 2785280.0, 252073065.82263485, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045)
INSERTED_NODE = (1, 'ec253b04-b1dc-f08b-acac-e23df83b3602', 'ip-192-168-86-107.ec2.internal', 24.0, 196608000000.0, 'amd64', 'linux', 'i3en.6xlarge', 'us-east-1c', 'spot')

# these were obtained by executing the respective queries to Prometheus and capturing the JSON output
Expand Down
2 changes: 1 addition & 1 deletion gantry/tests/sql/insert_job.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671,0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(1,'runner-hwwb-i3u-project-2-concurrent-1-s10tq41z',2,1706117046,1706118420,9892514,'success','pr42264_bugfix/mathomp4/hdf5-appleclang15','gmsh','4.8.4','{"alglib": true, "cairo": false, "cgns": true, "compression": true, "eigen": false, "external": false, "fltk": true, "gmp": true, "hdf5": false, "ipo": false, "med": true, "metis": true, "mmg": true, "mpi": true, "netgen": true, "oce": true, "opencascade": false, "openmp": false, "petsc": false, "privateapi": false, "shared": true, "slepc": false, "tetgen": true, "voropp": true, "build_system": "cmake", "build_type": "Release", "generator": "make"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',16,0.75,NULL,4.12532286694540495,3.15805864677520409,11.6038107294648877,0.248374361826775191,3.34888880339475214,2000000000.0,48000000000.0,1649868862.72588062,999763968.0,5679742976.0,2785280.0,1378705563.21018671,0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);
10 changes: 5 additions & 5 deletions gantry/tests/sql/insert_samples.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
INSERT INTO nodes VALUES(6789,'ec2c47a0-7e9b-cfa3-9ad4-ac227ade598d','ip-192-168-202-150.ec2.internal',32.0,131072000000.0,'amd64','linux','m5.8xlarge','us-east-1c','spot');
INSERT INTO jobs VALUES(6781,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi1',6789,1708919572.983000041,1708924744.811000108,101502092,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,9.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9652098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(6782,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi2',6789,1708919572.983000041,1708924744.811000108,101502093,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,10.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9958098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(6783,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi3',6789,1708919572.983000041,1708924744.811000108,101502094,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,11.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9158098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(6784,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi4',6789,1708919572.983000041,1708924744.811000108,101502095,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,12.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9758098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(6785,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi5',6789,1708919572.983000041,1708924744.811000108,101502096,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,13.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9358098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.002981770833333333, 0.0009706285264756945, 2.533024466030796, 13.918038711341255);
INSERT INTO jobs VALUES(6781,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi1',6789,1708919572.983000041,1708924744.811000108,101502092,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,9.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9652098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);
INSERT INTO jobs VALUES(6782,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi2',6789,1708919572.983000041,1708924744.811000108,101502093,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,10.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9958098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);
INSERT INTO jobs VALUES(6783,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi3',6789,1708919572.983000041,1708924744.811000108,101502094,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,11.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9158098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);
INSERT INTO jobs VALUES(6784,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi4',6789,1708919572.983000041,1708924744.811000108,101502095,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,12.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9758098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);
INSERT INTO jobs VALUES(6785,'runner-2j2ndhxu-project-2-concurrent-0-nbogpypi5',6789,1708919572.983000041,1708924744.811000108,101502096,'success','develop','py-torch','2.2.1','{"caffe2": false, "cuda": true, "cudnn": true, "debug": false, "distributed": true, "fbgemm": true, "gloo": true, "kineto": true, "magma": false, "metal": false, "mkldnn": true, "mpi": true, "nccl": false, "nnpack": true, "numa": true, "numpy": true, "onnx_ml": true, "openmp": true, "qnnpack": true, "rocm": false, "tensorpipe": true, "test": false, "valgrind": true, "xnnpack": true, "build_system": "python_pip", "cuda_arch": "80"}','gcc','11.4.0','linux-ubuntu20.04-x86_64_v3','e4s',12,12.0,NULL,13.77948152336477605,11.98751586519425772,12.00060520666194109,0.3736576704015182604,3.811106184376615414,48000000000.0,64000000000.0,9358098890.24199867,7399608320.0,41186873344.0,85508096.0,8707419891.779100419, 0.007552898472930367, 6.973888682208994e-05, 0.0045711276395970345, 0.0009008896396536045);

0 comments on commit fc688c4

Please sign in to comment.