Skip to content

Commit 2380578

Browse files
authored
Merge pull request #26 from compspec/refactor-plugin-design-add-nodes
wip: add support for node extraction -> cluster metadata
2 parents 51ffdbe + 0061aac commit 2380578

37 files changed

+1057
-235
lines changed

Diff for: .gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,7 @@
1717
# Dependency directories (remove the comment below to include it)
1818
# vendor/
1919
bin
20-
vendor
20+
vendor
21+
cache
22+
lib
23+
*.json

Diff for: Makefile.hwloc

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# This makefile will be used when we can add hwloc - there is currently a bug.
2+
HERE ?= $(shell pwd)
3+
LOCALBIN ?= $(shell pwd)/bin
4+
5+
# Install hwloc here for use to compile, etc.
6+
LOCALLIB ?= $(shell pwd)/lib
7+
HWLOC_INCLUDE ?= $(LOCALLIB)/include/hwloc.h
8+
BUILDENVVAR=CGO_CFLAGS="-I$(LOCALLIB)/include" CGO_LDFLAGS="-L$(LOCALLIB)/lib -lhwloc"
9+
10+
.PHONY: all
11+
12+
all: build
13+
14+
.PHONY: $(LOCALBIN)
15+
$(LOCALBIN):
16+
mkdir -p $(LOCALBIN)
17+
18+
.PHONY: $(LOCALLIB)
19+
$(LOCALLIB):
20+
mkdir -p $(LOCALLIB)
21+
22+
$(HWLOC_INCLUDE):
23+
git clone --depth 1 https://github.com/open-mpi/hwloc /tmp/hwloc || true && \
24+
cd /tmp/hwloc && ./autogen.sh && \
25+
./configure --enable-static --disable-shared LDFLAGS="-static" --prefix=$(LOCALLIB)/ && \
26+
make LDFLAGS=-all-static && make install
27+
28+
build: $(LOCALBIN) $(HWLOC_INCLUDE)
29+
GO111MODULE="on" $(BUILDENVVAR) go build -ldflags '-w' -o $(LOCALBIN)/compspec cmd/compspec/compspec.go
30+
31+
build-arm: $(LOCALBIN) $(HWLOC_INCLUDE)
32+
GO111MODULE="on" $(BUILDENVVAR) GOARCH=arm64 go build -ldflags '-w' -o $(LOCALBIN)/compspec-arm cmd/compspec/compspec.go
33+
34+
build-ppc: $(LOCALBIN) $(HWLOC_INCLUDE)
35+
GO111MODULE="on" $(BUILDENVVAR) GOARCH=ppc64le go build -ldflags '-w' -o $(LOCALBIN)/compspec-ppc cmd/compspec/compspec.go

Diff for: README.md

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ This is a prototype compatibility checking tool. Right now our aim is to use in
1111

1212
- I'm starting with just Linux. I know there are those "other" platforms, but if it doesn't run on HPC or Kubernetes easily I'm not super interested (ahem, Mac and Windows)!
1313
- not all extractors work in containers (e.g., kernel needs to be on the host)
14+
- The node feature discovery source doesn't provide mapping of socket -> cores, nor does it give details about logical vs. physical CPU.
15+
- We will likely want to add hwloc go bindings, but there is a bug currently.
1416

1517
Note that for development we are using nfd-source that does not require kubernetes:
1618

Diff for: cmd/compspec/compspec.go

+29-9
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,21 @@ func main() {
5454
cachePath := matchCmd.String("", "cache", &argparse.Options{Help: "A path to a cache for artifacts"})
5555
saveGraph := matchCmd.String("", "cache-graph", &argparse.Options{Help: "Load or use a cached graph"})
5656

57-
// Create arguments
58-
options := createCmd.StringList("a", "append", &argparse.Options{Help: "Append one or more custom metadata fields to append"})
59-
specname := createCmd.String("i", "in", &argparse.Options{Required: true, Help: "Input yaml that contains spec for creation"})
60-
specfile := createCmd.String("o", "out", &argparse.Options{Help: "Save compatibility json artifact to this file"})
61-
mediaType := createCmd.String("m", "media-type", &argparse.Options{Help: "The expected media-type for the compatibility artifact"})
62-
allowFailCreate := createCmd.Flag("f", "allow-fail", &argparse.Options{Help: "Allow any specific extractor to fail (and continue extraction)"})
57+
// Create subcommands - note that "nodes" could be cluster, but could want to make a subset of one
58+
artifactCmd := createCmd.NewCommand("artifact", "Create a new artifact")
59+
nodesCmd := createCmd.NewCommand("nodes", "Create nodes in Json Graph format from extraction data")
60+
61+
// Artifact creation arguments
62+
options := artifactCmd.StringList("a", "append", &argparse.Options{Help: "Append one or more custom metadata fields to append"})
63+
specname := artifactCmd.String("i", "in", &argparse.Options{Required: true, Help: "Input yaml that contains spec for creation"})
64+
specfile := artifactCmd.String("o", "out", &argparse.Options{Help: "Save compatibility json artifact to this file"})
65+
mediaType := artifactCmd.String("m", "media-type", &argparse.Options{Help: "The expected media-type for the compatibility artifact"})
66+
allowFailCreate := artifactCmd.Flag("f", "allow-fail", &argparse.Options{Help: "Allow any specific extractor to fail (and continue extraction)"})
67+
68+
// Nodes creation arguments
69+
nodesOutFile := nodesCmd.String("", "nodes-output", &argparse.Options{Help: "Output json file for cluster nodes"})
70+
nodesDir := nodesCmd.String("", "node-dir", &argparse.Options{Required: true, Help: "Input directory with extraction data for nodes"})
71+
clusterName := nodesCmd.String("", "cluster-name", &argparse.Options{Required: true, Help: "Cluster name to describe in graph"})
6372

6473
// Now parse the arguments
6574
err := parser.Parse(os.Args)
@@ -75,10 +84,21 @@ func main() {
7584
log.Fatalf("Issue with extraction: %s\n", err)
7685
}
7786
} else if createCmd.Happened() {
78-
err := create.Run(*specname, *options, *specfile, *allowFailCreate)
79-
if err != nil {
80-
log.Fatal(err.Error())
87+
if artifactCmd.Happened() {
88+
err := create.Artifact(*specname, *options, *specfile, *allowFailCreate)
89+
if err != nil {
90+
log.Fatal(err.Error())
91+
}
92+
} else if nodesCmd.Happened() {
93+
err := create.Nodes(*nodesDir, *clusterName, *nodesOutFile)
94+
if err != nil {
95+
log.Fatal(err.Error())
96+
}
97+
} else {
98+
fmt.Println(Header)
99+
fmt.Println("Please provide a --node-dir and (optionally) --nodes-output (json file to write)")
81100
}
101+
82102
} else if matchCmd.Happened() {
83103
err := match.Run(
84104
*manifestFile,

Diff for: cmd/compspec/create/artifact.go

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package create
2+
3+
import (
4+
"strings"
5+
6+
"github.com/compspec/compspec-go/plugins/creators/artifact"
7+
)
8+
9+
// Artifact will create a compatibility artifact based on a request in YAML
10+
// TODO likely want to refactor this into a proper create plugin
11+
func Artifact(specname string, fields []string, saveto string, allowFail bool) error {
12+
13+
// This is janky, oh well
14+
allowFailFlag := "false"
15+
if allowFail {
16+
allowFailFlag = "true"
17+
}
18+
19+
// assemble options for node creator
20+
creator, err := artifact.NewPlugin()
21+
if err != nil {
22+
return err
23+
}
24+
options := map[string]string{
25+
"specname": specname,
26+
"fields": strings.Join(fields, "||"),
27+
"saveto": saveto,
28+
"allowFail": allowFailFlag,
29+
}
30+
return creator.Create(options)
31+
}

Diff for: cmd/compspec/create/nodes.go

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package create
2+
3+
import (
4+
"github.com/compspec/compspec-go/plugins/creators/cluster"
5+
)
6+
7+
// Nodes will read in one or more node extraction metadata files and generate a single nodes JGF graph
8+
// This is intended for a registration command.
9+
// TODO this should be converted to a creation (converter) plugin
10+
func Nodes(nodesDir, clusterName, nodeOutFile string) error {
11+
12+
// assemble options for node creator
13+
creator, err := cluster.NewPlugin()
14+
if err != nil {
15+
return err
16+
}
17+
options := map[string]string{
18+
"nodes-dir": nodesDir,
19+
"cluster-name": clusterName,
20+
"node-outfile": nodeOutFile,
21+
}
22+
return creator.Create(options)
23+
}

Diff for: cmd/compspec/extract/extract.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ func Run(filename string, pluginNames []string, allowFail bool) error {
1515
// Womp womp, we only support linux! There is no other way.
1616
operatingSystem := runtime.GOOS
1717
if operatingSystem != "linux" {
18-
return fmt.Errorf("🤓️ Sorry, we only support linux.")
18+
return fmt.Errorf("🤓️ sorry, we only support linux")
1919
}
2020

2121
// parse [section,...,section] into named plugins and sections
@@ -37,7 +37,7 @@ func Run(filename string, pluginNames []string, allowFail bool) error {
3737
// This returns an array of bytes
3838
b, err := result.ToJson()
3939
if err != nil {
40-
return fmt.Errorf("There was an issue marshalling to JSON: %s\n", err)
40+
return fmt.Errorf("there was an issue marshalling to JSON: %s", err)
4141
}
4242
err = os.WriteFile(filename, b, 0644)
4343
if err != nil {

Diff for: docs/README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
This is early documentation that will be converted eventually to something prettier. Read more about:
44

5-
- [Design](design.md)
6-
- [Usage](usage.md)
7-
5+
- [Design](design.md) of compspec
6+
- [Usage](usage.md) generic use cases
7+
- [Rainbow](rainbow.md) use cases and examples for the rainbow scheduler
88

99
## Thanks and Previous Art
1010

Diff for: docs/design.md

+17-9
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,28 @@ The compatibility tool is responsible for extracting information about a system,
77

88
## Definitions
99

10-
### Extractor
10+
### Plugin
1111

12-
An **extractor** is a core plugin that knows how to retrieve metadata about a host. An extractor is usually going to be run for two cases:
12+
A plugin can define one or more functionalities;
13+
14+
- "Extract" is expected to know how to extract metadata about an application or environment
15+
- "Create" is expected to create something from extracted data
16+
17+
This means that an **extractor** is a core plugin that knows how to retrieve metadata about a host. An extractor is usually going to be run for two cases:
1318

1419
1. During CI to extract (and save) metadata about a particular build to put in a compatibility artifact.
1520
2. During image selection to extract information about the host to compare to.
1621

17-
Examples extractors could be "library" or "system."
22+
Example extractors could be "library" or "system." You interact with extractor plugins via the "extract" command.
23+
24+
A **creator** is a plugin that is responsible for creating an artifact that includes some extracted metadata. The creator is agnostic to what it is being asked to generate in the sense that it just needs a mapping. The mapping will be from the extractor namespace to the compatibility artifact namespace. For our first prototype, this just means asking for particular extractor attributes to map to a set of annotations that we want to dump into json. To start there should only be one creator plugin needed, however if there are different structures of artifacts needed, I could imagine more. An example creation specification for a prototype experiment where we care about architecture, MPI, and GPU is provided in [examples](examples).
25+
26+
Plugins can be one or the other, or both.
1827

19-
### Section
28+
#### Section
2029

21-
A **section** is a group of metadata within an extractor. For example, within "library" a section is for "mpi." This allows a user to specify running the `--name library[mpi]` extractor to ask for the mpi section of the library family. Another example is under kernel.
30+
A **section** is a group of metadata typically within an extractor, and could also be defined for creators when we have more use cases.
31+
For example, within "library" a section is for "mpi." This allows a user to specify running the `--name library[mpi]` extractor to ask for the mpi section of the library family. Another example is under kernel.
2232
The user might want to ask for more than one group to be extracted and might ask for `--name kernel[boot,config]`. Section basically provides more granularity to an extractor namespace. For the above two examples, the metadata generated would be organized like:
2333

2434
```
@@ -31,12 +41,10 @@ kernel
3141

3242
For the above, right now I am implementing extractors generally, or "wild-westy" in the sense that the namespace is oriented toward the extractor name and sections it owns (e.g., no community namespaces like archspec, spack, opencontainers, etc). This is subject to change depending on the design the working group decides on.
3343

34-
### Creator
35-
36-
A creator is a plugin that is responsible for creating an artifact that includes some extracted metadata. The creator is agnostic to what it is being asked to generate in the sense that it just needs a mapping. The mapping will be from the extractor namespace to the compatibility artifact namespace. For our first prototype, this just means asking for particular extractor attributes to map to a set of annotations that we want to dump into json. To start there should only be one creator plugin needed, however if there are different structures of artifacts needed, I could imagine more. An example creation specification for a prototype experiment where we care about architecture, MPI, and GPU is provided in [examples](examples).
37-
3844
## Overview
3945

46+
> This was the original proposal and may be out of date.
47+
4048
The design is based on the prototype from that pull request, shown below.
4149

4250
![img/proposal-c-plugin-design.png](img/proposal-c-plugin-design.png)

Diff for: docs/img/rainbow-scheduler-register.png

75.8 KB
Loading

Diff for: docs/rainbow/README.md

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Rainbow Scheduler
2+
3+
The [rainbow scheduler](https://github.com/converged-computing/rainbow) has a registration step that requires a cluster to send over node metadata. The reason is that when a user sends a request for work, the scheduler needs to understand
4+
how to properly assign it. To do that, it needs to be able to see all the resources (clusters) available to it.
5+
6+
![../img/rainbow-scheduler-register.png](../img/rainbow-scheduler-register.png)
7+
8+
For the purposes of compspec here, we care about the registration step. This is what that includes:
9+
10+
## Registration
11+
12+
1. At registration, the cluster also sends over metadata about itself (and the nodes it has). This is going to allow for selection for those nodes.
13+
1. When submitting a job, the user is no longer giving an exact command, but a command + an image with compatibility metadata. The compatibility metadata (somehow) needs to be used to inform the cluster selection.
14+
1. At selection, the rainbow scheduler needs to filter down cluster options, and choose a subset.
15+
- Level 1: Don't ask, just choose the top choice and submit
16+
- Level 2: Ask the cluster for TBA time or cost, choose based on that.
17+
- Job is added to that queue.
18+
19+
Specifically, this means two steps for compspec go:
20+
21+
1. A step to ask each node to extract its own metadata, saved to a directory.
22+
2. A second step to combine those nodes into a graph.
23+
24+
Likely we will take a simple approach to do an extract for one node that captures its metadata into JSON Graph Format (JGF) and then dumps into a shared directory (we might imagine this being run with a flux job)
25+
and then some combination step.
26+
27+
## Example
28+
29+
In the example below, we will extract node level metadata with `compspec extract` and then generate the cluster JGF to send for registration with `compspec create nodes`.
30+
31+
### 1. Extract Metadata
32+
33+
Let's first generate faux node metadata for a "cluster" - I will just run an extraction a few times and generate equivalent files :) This isn't such a crazy idea because it emulates nodes that are the same!
34+
35+
```bash
36+
mkdir -p ./docs/rainbow/cluster
37+
compspec extract --name library --name nfd[cpu,memory,network,storage,system] --name system[cpu,processor,arch,memory] --out ./docs/rainbow/cluster/node-1.json
38+
compspec extract --name library --name nfd[cpu,memory,network,storage,system] --name system[cpu,processor,arch,memory] --out ./docs/rainbow/cluster/node-2.json
39+
compspec extract --name library --name nfd[cpu,memory,network,storage,system] --name system[cpu,processor,arch,memory] --out ./docs/rainbow/cluster/node-3.json
40+
```
41+
42+
### 2. Create Nodes
43+
44+
Now we are going to give compspec the directory, and ask it to create nodes. This will be in JSON graph format. This outputs to the terminal:
45+
46+
```bash
47+
compspec create nodes --cluster-name cluster-red --node-dir ./docs/rainbow/cluster/
48+
```

0 commit comments

Comments
 (0)