# tum-cluster-config.yaml
# Job configuration (protocol version 2) that starts one container from the
# nerfstudio 1.1.5 image on a single GPU node and keeps it running.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened to column 0. Keys and values are unchanged;
# confirm the structure against the cluster's job-submission documentation.
protocolVersion: 2
name: nerfstudio_b5b9de27
type: job
jobRetryCount: 0

prerequisites:
  - type: dockerimage
    uri: 'ghcr.io/nerfstudio-project/nerfstudio:1.1.5'
    # Referenced by taskRoles.taskrole.dockerImage below.
    name: docker_image

taskRoles:
  taskrole:
    instances: 1
    completion:
      minFailedInstances: 1
    taskRetryCount: 0
    dockerImage: docker_image
    resourcePerInstance:
      gpu: 1
      cpu: 4
      memoryMB: 30000
      ports:
        jupyter: 1
    # Restricts scheduling to the listed hosts.
    # NOTE(review): placement as a sibling of resourcePerInstance is assumed
    # from the key name — confirm against the cluster's documentation.
    nodeSelectionPerInstance:
      - key: kubernetes.io/hostname
        operator: In
        values:
          - node13.garching.cluster.campar.in.tum.de  # RTX 3090
          - node14.garching.cluster.campar.in.tum.de  # RTX 3090
          - node05.garching.cluster.campar.in.tum.de  # RTX 2080-Ti
          - node12.garching.cluster.campar.in.tum.de  # RTX 2080
          - node08.garching.cluster.campar.in.tum.de  # RTX 2080-Ti
          - node10.garching.cluster.campar.in.tum.de  # RTX 2080-Ti
    commands:
      # Blocks forever so the container stays up; presumably the job is used
      # interactively through the ssh plugin configured under extras.
      - sleep infinity

defaults:
  virtualCluster: default

extras:
  com.microsoft.pai.runtimeplugin:
    - plugin: ssh
      parameters:
        jobssh: true
        userssh:
          type: custom
          # NOTE(review): the custom key value is empty — presumably a public
          # key must be supplied here before submitting; confirm.
          value: ''
    - plugin: teamwise_storage
      parameters:
        storageConfigNames:
          - ceph-datasets
          - ceph-projects
          - ceph-scratch
  # Only the "failed" state change triggers a notification.
  jobStatusChangeNotification:
    running: false
    succeeded: false
    stopped: false
    failed: true
    retried: false