Skip to content

Commit ea00dcf

Browse files
committed
Merge pull request #367 from Kitware/ec2-cluster
Ec2 cluster
2 parents f825b35 + 3d33e60 commit ea00dcf

File tree

18 files changed

+256
-46
lines changed

18 files changed

+256
-46
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@ bundles
55
node_modules/
66
npm-debug.log
77
*.pyc
8+
.project
9+
.pydevproject
10+
.settings/

server/taskflows/hpccloud/taskflow/paraview.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,25 @@ class ParaViewTaskFlow(cumulus.taskflow.TaskFlow):
4949
5050
"""
5151
def start(self, *args, **kwargs):
52-
53-
# Load the cluster
54-
model = ModelImporter.model('cluster', 'cumulus')
5552
user = getCurrentUser()
56-
cluster = model.load(kwargs['cluster']['_id'],
57-
user=user, level=AccessType.ADMIN)
58-
cluster = model.filter(cluster, user, passphrase=False)
59-
kwargs['cluster'] = cluster
53+
# Load the cluster
54+
cluster_id = parse('cluster._id').find(kwargs)
55+
if cluster_id:
56+
model = ModelImporter.model('cluster', 'cumulus')
57+
cluster = model.load(kwargs['cluster']['_id'],
58+
user=user, level=AccessType.ADMIN)
59+
cluster = model.filter(cluster, user, passphrase=False)
60+
kwargs['cluster'] = cluster
61+
62+
profile_id = parse('cluster.profileId').find(kwargs)
63+
if profile_id:
64+
profile_id = profile_id[0].value
65+
model = ModelImporter.model('aws', 'cumulus')
66+
profile = model.load(profile_id, user=user, level=AccessType.ADMIN)
67+
kwargs['profile'] = profile
6068

6169
super(ParaViewTaskFlow, self).start(
62-
create_paraview_job.s(self, *args, **kwargs))
70+
setup_cluster.s(self, next=create_paraview_job.s(), *args, **kwargs))
6371

6472
def terminate(self):
6573
self.run_task(paraview_terminate.s())

server/taskflows/hpccloud/taskflow/pyfr.py

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import subprocess
2323
import shutil
2424
from ConfigParser import SafeConfigParser
25+
from jsonpath_rw import parse
26+
from bson.objectid import ObjectId
2527

2628
import cumulus.taskflow
2729
from cumulus.tasks.job import download_job_input_folders, submit_job
@@ -64,18 +66,32 @@ class PyFrTaskFlow(cumulus.taskflow.TaskFlow):
6466
}
6567
}
6668
"""
67-
def start(self, *args, **kwargs):
69+
PYFR_AMI = 'ami-7def1b1d'
6870

69-
# Load the cluster
70-
model = ModelImporter.model('cluster', 'cumulus')
71+
def start(self, *args, **kwargs):
7172
user = getCurrentUser()
72-
cluster = model.load(kwargs['cluster']['_id'],
73-
user=user, level=AccessType.ADMIN)
74-
cluster = model.filter(cluster, user, passphrase=False)
75-
kwargs['cluster'] = cluster
73+
# Load the cluster
74+
cluster_id = parse('cluster._id').find(kwargs)
75+
if cluster_id:
76+
cluster_id = cluster_id[0].value
77+
model = ModelImporter.model('cluster', 'cumulus')
78+
cluster = model.load(cluster_id, user=user, level=AccessType.ADMIN)
79+
cluster = model.filter(cluster, user, passphrase=False)
80+
kwargs['cluster'] = cluster
81+
82+
profile_id = parse('cluster.profileId').find(kwargs)
83+
if profile_id:
84+
profile_id = profile_id[0].value
85+
model = ModelImporter.model('aws', 'cumulus')
86+
profile = model.load(profile_id, user=user, level=AccessType.ADMIN)
87+
kwargs['profile'] = profile
88+
89+
kwargs['next'] = setup_input.s()
90+
kwargs['ami'] = self.PYFR_AMI
7691

7792
super(PyFrTaskFlow, self).start(
78-
setup_input.s(self,*args, **kwargs))
93+
setup_cluster.s(
94+
self, *args, **kwargs))
7995

8096
def terminate(self):
8197
self.run_task(pyfr_terminate.s())
@@ -167,7 +183,8 @@ def update_config_file(task, client, *args, **kwargs):
167183
task.logger.info('%s removed.' % section)
168184

169185
backend_section = 'backend-%s' % kwargs['backend']['type']
170-
task.logger.info('Adding backend configuration for %s')
186+
task.logger.info('Adding backend configuration for %s'
187+
% kwargs['backend']['type'] )
171188
# Filter out options with no value
172189
options = {k: v for k, v in kwargs['backend'].iteritems() if v}
173190
options.pop('type', None)
@@ -191,8 +208,6 @@ def update_config_file(task, client, *args, **kwargs):
191208

192209
@cumulus.taskflow.task
193210
def setup_input(task, *args, **kwargs):
194-
task.logger.info('Input parameters: %s' % kwargs)
195-
196211
input_folder_id = kwargs['input']['folder']['id']
197212
mesh_file_id = kwargs['input']['meshFile']['id']
198213
kwargs['meshFileId'] = mesh_file_id
@@ -201,6 +216,14 @@ def setup_input(task, *args, **kwargs):
201216
if not number_of_procs:
202217
number_of_procs = kwargs.get('numberOfNodes')
203218

219+
if not number_of_procs:
220+
size = parse('cluster.config.launch.params.node_instance_count').find(kwargs)
221+
if size:
222+
number_of_procs = size[0].value + 1
223+
else:
224+
raise Exception('Unable to extract number of nodes in cluster')
225+
226+
204227
if not number_of_procs:
205228
raise Exception('Unable to determine number of mpi processes to run.')
206229

@@ -270,6 +293,24 @@ def setup_input(task, *args, **kwargs):
270293
if os.path.exists(output_dir):
271294
shutil.rmtree(output_dir)
272295

296+
# If we are running in the cloud determine backend to use
297+
if kwargs['cluster']['type'] == 'ec2':
298+
machine_spec = kwargs.get('machine')
299+
# If we have GPUs use cuda
300+
if int(machine_spec['gpu']) == 1:
301+
backend = {
302+
'type': 'cuda',
303+
'device-id': 'round-robin'
304+
}
305+
# Use OpenMP
306+
else:
307+
backend = {
308+
'type': 'openmp',
309+
'cblas': '/usr/lib/openblas-base/libblas.so'
310+
}
311+
312+
kwargs['backend'] = backend
313+
273314
update_config_file(task, client, *args, **kwargs)
274315

275316
ini_file_id = kwargs['input']['iniFile']['id']
@@ -302,7 +343,10 @@ def create_job(task, *args, **kwargs):
302343
'path': 'input'
303344
}
304345
],
305-
'output': []
346+
'output': [],
347+
'params': {
348+
'numberOfSlots': kwargs['numberOfProcs']
349+
}
306350
}
307351

308352
client = _create_girder_client(
@@ -334,7 +378,8 @@ def submit_pyfr_job(task, cluster, job, *args, **kwargs):
334378
task.logger.info('Submitting job %s to cluster.' % job['_id'])
335379
girder_token = task.taskflow.girder_token
336380

337-
job['params'] = kwargs
381+
job['params'].update(kwargs)
382+
338383
submit_job(cluster, job, log_write_url=None,
339384
girder_token=girder_token, monitor=False)
340385

@@ -391,7 +436,10 @@ def create_export_job(task, job_name, files, job_dir, mesh_filename):
391436
'name': job_name,
392437
'commands': commands,
393438
'input': [],
394-
'output': []
439+
'output': [],
440+
'params': {
441+
'numberOfSlots': 1
442+
}
395443
}
396444

397445
client = _create_girder_client(

server/taskflows/hpccloud/taskflow/utility/__init__.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import json
22
from jsonpath_rw import parse
3+
from celery.canvas import Signature
34

5+
import cumulus
46
from cumulus.tasks.job import terminate_job
57
from cumulus.constants import JobState
68

9+
from girder_client import GirderClient, HttpError
710

811
def terminate_jobs(task, client, cluster, jobs):
912
for job in jobs:
@@ -32,3 +35,98 @@ def get_cluster_job_output_dir(cluster):
3235

3336
return job_output_dir
3437

38+
def create_girder_client(girder_api_url, girder_token):
39+
client = GirderClient(apiUrl=girder_api_url)
40+
client.token = girder_token
41+
42+
return client
43+
44+
def create_ec2_cluster(task, cluster, profile, ami):
45+
machine_type = cluster['machine']['id']
46+
nodeCount = cluster['clusterSize']-1
47+
launch_spec = 'ec2'
48+
launch_params = {
49+
'master_instance_type': machine_type,
50+
'master_instance_ami': ami,
51+
'node_instance_count': nodeCount,
52+
'node_instance_type': machine_type,
53+
'node_instance_ami': ami
54+
}
55+
provision_spec = 'gridengine/site'
56+
provision_params = {
57+
'ansible_ssh_user': 'ubuntu'
58+
}
59+
60+
body = {
61+
'type': 'ec2',
62+
'name': cluster['name'],
63+
'config': {
64+
'launch': {
65+
'spec': launch_spec,
66+
'params': launch_params
67+
},
68+
'provision': {
69+
'spec': provision_spec
70+
}
71+
},
72+
'profileId': cluster['profileId']
73+
}
74+
client = create_girder_client(
75+
task.taskflow.girder_api_url, task.taskflow.girder_token)
76+
77+
try:
78+
cluster = client.post('clusters', data=json.dumps(body))
79+
except HttpError as he:
80+
task.logger.exception(he.responseText)
81+
raise
82+
83+
msg = 'Created cluster: %s' % cluster['_id']
84+
task.taskflow.logger.info(msg)
85+
task.logger.info(msg)
86+
87+
# Now save cluster id in metadata
88+
task.taskflow.set_metadata('cluster', cluster)
89+
90+
task.logger.info('Starting cluster.')
91+
92+
body = {
93+
'status': 'launching'
94+
}
95+
client.patch('clusters/%s' % cluster['_id'], data=json.dumps(body))
96+
97+
secret_key = profile['secretAccessKey']
98+
log_write_url = '%s/clusters/%s/log' % (task.taskflow.girder_api_url,
99+
cluster['_id'])
100+
provision_params['cluster_state'] = 'running'
101+
launch_params['cluster_state'] = 'running'
102+
girder_token = task.taskflow.girder_token
103+
cumulus.ansible.tasks.cluster.start_cluster(
104+
launch_spec, provision_spec, cluster, profile, secret_key,
105+
launch_params, provision_params, girder_token, log_write_url,
106+
master_name='head')
107+
108+
# Get the up-to-date cluster
109+
cluster = client.get('clusters/%s' % cluster['_id'])
110+
111+
return cluster
112+
113+
@cumulus.taskflow.task
114+
def setup_cluster(task, *args,**kwargs):
115+
cluster = kwargs['cluster']
116+
117+
if '_id' in cluster:
118+
task.taskflow.logger.info('We are using an existing cluster: %s' % cluster['name'])
119+
else:
120+
task.taskflow.logger.info('We are creating an EC2 cluster.')
121+
task.logger.info('Cluster name %s' % cluster['name'])
122+
kwargs['machine'] = cluster.get('machine')
123+
ami = kwargs.get('ami')
124+
profile = kwargs.get('profile')
125+
cluster = create_ec2_cluster(task, cluster, profile, ami)
126+
task.logger.info('Cluster started.')
127+
128+
# Call any follow-on task
129+
if 'next' in kwargs:
130+
kwargs['cluster'] = cluster
131+
next = Signature.from_dict(kwargs['next'])
132+
next.delay(*args, **kwargs)

src/StateTransitionBehavior.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ export function handleTaskflowChange(state, taskflow) {
7777
}
7878
}
7979

80+
// for taskflows on ec2 the meta object is not as readily available
81+
// this is due to fewer jobs coming through SSE, which is what triggers a fetch for traditional clusters.
82+
if (!taskflow.flow.meta) {
83+
dispatch(TaskflowActions.fetchTaskflow(taskflow.flow._id));
84+
}
85+
8086
// Update taskflow meta
8187
if (allComplete !== taskflow.allComplete ||
8288
outputDirectory[0] !== taskflow.outputDirectory ||

src/pages/Preferences/AWS/AWSForm.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ export default React.createClass({
4343
const data = nextProps.data,
4444
oldData = this.props.data;
4545

46+
if (!nextProps.data._id) {
47+
this.refs.nameInput.focus();
48+
}
49+
4650
if (!deepEquals(data, oldData)) {
4751
this.setState({ data });
4852
}
@@ -90,6 +94,7 @@ export default React.createClass({
9094
onChange={this.formChange}
9195
disabled={this.state.data._id}
9296
required
97+
ref="nameInput"
9398
/>
9499
</section>
95100
<section className={style.group}>

src/pages/Preferences/Cluster/ClusterForm.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ export default React.createClass({
3838
onChange: React.PropTypes.func,
3939
},
4040

41+
componentWillReceiveProps(nextProps) {
42+
if (!nextProps.data._id) {
43+
this.refs.nameInput.focus();
44+
}
45+
},
46+
4147
formChange(event) {
4248
const propName = event.target.dataset.key;
4349
const value = event.target.value;
@@ -81,6 +87,7 @@ export default React.createClass({
8187
data-key="name"
8288
onChange={this.formChange}
8389
required
90+
ref="nameField"
8491
/>
8592
</section>
8693
<section className={style.group}>

src/pages/Preferences/Cluster/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ const ClusterPrefs = React.createClass({
8484

8585
clusterHasSimulation(id) {
8686
for (let i = 0; i < this.props.taskflows.length; i++) {
87-
if (this.props.taskflows[i].flow.meta.cluster._id === id) {
87+
if (this.props.taskflows[i].flow && this.props.taskflows[i].flow.meta.cluster._id === id) {
8888
return this.props.taskflows[i].simulation;
8989
}
9090
}

src/panels/JobMonitor/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ export default connect(
166166
// Sort the tasks by created timestamp
167167
tasks.sort((task1, task2) => Date.parse(task1.created) > Date.parse(task2.created));
168168

169-
if (cluster.log && cluster.log.length) {
169+
if (cluster && cluster.log && cluster.log.length) {
170170
clusterLog = cluster.log.sort((task1, task2) => Date.parse(task1.created) > Date.parse(task2.created));
171171
clusterLogStreamState = cluster.logStream ? cluster.logStream.readyState : CLOSED;
172172
}

src/panels/run/RunEC2.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ export default React.createClass({
5151

5252
if (key === 'profile') {
5353
value = this.state.profiles[value];
54+
} else if (key === 'machine') {
55+
value = machines[value];
5456
}
5557

5658
if (this.props.onChange) {
@@ -67,7 +69,7 @@ export default React.createClass({
6769
var machineMapper = (machine, index) =>
6870
<option
6971
key={machine.id}
70-
value={machine.id}
72+
value={index}
7173
>
7274
{ `${machine.name} - ${machine.cpu} core${machine.cpu > 1 ? 's' : ''} - ${machine.memory}GB ${machine.gpu ? ' + GPU' : ''} - ${machine.storage}` }
7375
</option>;
@@ -108,7 +110,7 @@ export default React.createClass({
108110
<select
109111
onChange={this.dataChange} className={style.input}
110112
data-key="machine"
111-
defaultValue={machines[0].id}
113+
defaultValue={machines[0]}
112114
>
113115
{machines.map(machineMapper)}
114116
</select>

0 commit comments

Comments
 (0)