converged-computing
diff --git a/‎api/openapi-spec/swagger.json‎
Lines changed: 27 additions & 0 deletions b/‎api/openapi-spec/swagger.json‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎api/python_api/kubeflow_trainer_api/models/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎api/python_api/kubeflow_trainer_api/models/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_flux_ml_policy_source.py‎
Lines changed: 87 additions & 0 deletions b/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_flux_ml_policy_source.py‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_hpcml_policy_source.py‎
Lines changed: 91 additions & 0 deletions b/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_hpcml_policy_source.py‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_ml_policy.py‎
Lines changed: 7 additions & 1 deletion b/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_ml_policy.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_ml_policy_source.py‎
Lines changed: 7 additions & 1 deletion b/‎api/python_api/kubeflow_trainer_api/models/trainer_v1alpha1_ml_policy_source.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎charts/kubeflow-trainer/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions b/‎charts/kubeflow-trainer/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions b/‎charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎examples/flux/lammps-train-job.yaml‎
Lines changed: 32 additions & 0 deletions b/‎examples/flux/lammps-train-job.yaml‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions b/‎manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml‎
Lines changed: 10 additions & 0 deletions
@@ -47,6 +47,16 @@ spec:
                 description: mlPolicy provides the ML-specific parameters for the
                   model training.
                 properties:
+                  flux:
+                    description: flux defines the configuration for the Flux runtime.
+                    properties:
+                      numProcPerNode:
+                        default: 1
+                        description: numProcPerNode is the number of processes per
+                          node.
+                        format: int32
+                        type: integer
+                    type: object
                   mpi:
                     description: mpi defines the configuration for the MPI Runtime.
                     properties:
 
@@ -47,6 +47,16 @@ spec:
                 description: mlPolicy provides the ML-specific parameters for the
                   model training.
                 properties:
+                  flux:
+                    description: flux defines the configuration for the Flux runtime.
+                    properties:
+                      numProcPerNode:
+                        default: 1
+                        description: numProcPerNode is the number of processes per
+                          node.
+                        format: int32
+                        type: integer
+                    type: object
                   mpi:
                     description: mpi defines the configuration for the MPI Runtime.
                     properties:
 
@@ -0,0 +1,32 @@
+# This example deploys the LAMMPS molecular dynamics simulator
+# with MPI, orchestrated by the Flux workload manager, on 4 nodes.
+# The problem size is defined by the variables x, y, and z, and by
+# the input file in.reaxc.hns.
+# The image has the application, LAMMPS, installed (no Flux).
+# A Flux view will be added on the fly by the Kubeflow trainer.
+# The 4 pods ideally map 1:1 to nodes, together forming a cluster.
+# The underlying abstraction is a JobSet with a headless service.
+# Flux supports low-latency networking (InfiniBand, EFA, etc.);
+# however, standard Ethernet is used here.
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainJob
+metadata:
+  name: lammps-flux-interactive  # NOTE(review): as written the job runs the command below; interactive use requires commenting it out
+spec:
+  # Reference the pre-defined runtime by name.
+  runtimeRef:
+    name: flux-runtime
+  trainer:
+    numNodes: 4
+    image: ghcr.io/converged-computing/metric-lammps:latest
+    # Do not prefix the command with "flux run ..." here; the command will be wrapped for you.
+    command: ["lmp", "-v", "x", "2", "-v", "y", "2", "-v", "z", "2", "-in", "in.reaxc.hns", "-nocite"]
+    # To get an interactive cluster instead, comment out the command above, then shell into the 0-0 pod:
+    #  # Source the Flux view environment
+    #  . /mnt/flux/flux-view.sh
+    #  # Connect to the running lead broker socket
+    #  flux proxy $fluxsocket bash
+    #  # List the Flux resources
+    #  flux resource list
+    #  # Run LAMMPS across 4 nodes with 4 tasks
+    #  flux run -N 4 -n 4 lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
@@ -47,6 +47,16 @@ spec:
                 description: mlPolicy provides the ML-specific parameters for the
                   model training.
                 properties:
+                  flux:
+                    description: flux defines the configuration for the Flux runtime.
+                    properties:
+                      numProcPerNode:
+                        default: 1
+                        description: numProcPerNode is the number of processes per
+                          node.
+                        format: int32
+                        type: integer
+                    type: object
                   mpi:
                     description: mpi defines the configuration for the MPI Runtime.
                     properties: