Skip to content

Commit f0b50ba

Browse files
authored
Merge Release 2.4.1
Merge Release 2.4.1
2 parents e94e9c2 + 0b49147 commit f0b50ba

36 files changed

+286
-548
lines changed

CHANGELOG.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,43 @@ aws-parallelcluster-cookbook CHANGELOG
33

44
This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
55

6+
2.4.1
7+
-----
8+
9+
**ENHANCEMENTS**
10+
- Install Intel MPI on Amazon Linux, CentOS 7 and Ubuntu 16.04
11+
- Upgrade EFA to version 1.4.1
12+
- Run all node daemons and cookbook recipes in isolated Python virtualenvs. This allows our code to always run with the
13+
required Python dependencies and solves all conflicts and runtime failures that were being caused by user packages
14+
installed in the system Python
15+
16+
**CHANGES**
17+
- Torque: upgrade to version 6.1.2
18+
- Run all node daemons with Python 3.6
19+
- Torque: changed the following parameters in the global configuration:
20+
- `server node_check_rate = 120` - Specifies the minimum duration (in seconds)
21+
that a node can fail to send a status update before being marked down by the
22+
pbs_server daemon. Previously was 600. This reduces scaling reaction times in
23+
case of instance failure or unexpected termination (especially with spot)
24+
- `server node_ping_rate = 60` - Specifies the maximum interval (in seconds)
25+
between successive "pings" sent from the pbs_server daemon to the pbs_mom
26+
daemon to determine node/daemon health. Previously was 300. It is now set to half
27+
the node_check_rate.
28+
- `server timeout_for_job_delete = 30` - The specific timeout used when deleting
29+
jobs because the node they are executing on is being deleted. Previously was
30+
120. This prevents job deletion from hanging for more than 30 seconds when the node
31+
they are running on is being deleted.
32+
- `server timeout_for_job_requeue = 30` - The specific timeout used when requeuing
33+
jobs because the node they are executing on is being deleted. Previously was
34+
120. This prevents node deletion from hanging for more than 30 seconds when a job
35+
cannot be rescheduled.
36+
37+
**BUG FIXES**
38+
- Restore correct value for `filehandle_limit` that was getting reset when setting `memory_limit` for EFA
39+
- Torque: fix configuration of server operators that was preventing compute nodes from disabling themselves
40+
before termination
41+
42+
643
2.4.0
744
-----
845

amis/build_ami.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
# build-date: timestamp to append to the AMIs names (optional)
2121

2222
requirements_check() {
23-
which packer >/dev/null 2>&1
23+
packer build --help >/dev/null 2>&1
2424
if [ $? -ne 0 ] ; then
2525
echo "packer command not found. Is Packer installed?"
2626
echo "Please visit https://www.packer.io/downloads.html for instruction on how to download and install"
2727
exit 1
2828
fi
2929

30-
which berks >/dev/null 2>&1
30+
berks vendor --help >/dev/null 2>&1
3131
if [ $? -ne 0 ] ; then
3232
echo "berks command not found. Is ChefDK installed?"
3333
echo "Please visit https://downloads.chef.io/chefdk/ for instruction on how to download and install"
@@ -160,6 +160,9 @@ do_command() {
160160
export BUILD_DATE=${_build_date}
161161
fi
162162

163+
# Set the timeout to 1 hour (3600 seconds) to resolve the AMI copy timeout issue, see:
164+
# https://github.com/hashicorp/packer/issues/6536
165+
export AWS_TIMEOUT_SECONDS=3600
163166

164167
case ${_os} in
165168
all)
@@ -190,4 +193,4 @@ main() {
190193
do_command
191194
}
192195

193-
main "$@"
196+
main "$@"

amis/packer_variables.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"parallelcluster_version": "2.4.0",
3-
"parallelcluster_cookbook_version": "2.4.0",
2+
"parallelcluster_version": "2.4.1",
3+
"parallelcluster_cookbook_version": "2.4.1",
44
"chef_version": "14.2.0",
55
"ridley_version": "5.1.1",
66
"berkshelf_version": "7.0.4"

attributes/default.rb

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,30 @@
1818
default['cfncluster']['sources_dir'] = "#{node['cfncluster']['base_dir']}/sources"
1919
default['cfncluster']['scripts_dir'] = "#{node['cfncluster']['base_dir']}/scripts"
2020
default['cfncluster']['license_dir'] = "#{node['cfncluster']['base_dir']}/licenses"
21+
# Python Version
22+
default['cfncluster']['python-version'] = '3.6.9'
23+
# Virtualenv Cookbook Name
24+
default['cfncluster']['cookbook_virtualenv'] = 'cookbook_virtualenv'
25+
# Virtualenv Node Name
26+
default['cfncluster']['node_virtualenv'] = 'node_virtualenv'
27+
# Cookbook Virtualenv Path
28+
default['cfncluster']['cookbook_virtualenv_path'] = "/root/.pyenv/versions/#{node['cfncluster']['python-version']}/envs/#{node['cfncluster']['cookbook_virtualenv']}"
29+
# Node Virtualenv Path
30+
default['cfncluster']['node_virtualenv_path'] = "/root/.pyenv/versions/#{node['cfncluster']['python-version']}/envs/#{node['cfncluster']['node_virtualenv']}"
31+
# Intel MPI
32+
default['cfncluster']['intelmpi']['url'] = "http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/15553/aws_impi.sh"
33+
default['cfncluster']['intelmpi']['version'] = '2019.4.243'
34+
default['cfncluster']['intelmpi']['modulefile'] = "/opt/intel/impi/#{node['cfncluster']['intelmpi']['version']}/intel64/modulefiles/mpi"
2135
# Python packages
22-
default['cfncluster']['cfncluster-version'] = '2.4.0'
23-
default['cfncluster']['cfncluster-node-version'] = '2.4.0'
24-
default['cfncluster']['supervisor-version'] = '3.4.0'
36+
default['cfncluster']['cfncluster-version'] = '2.4.1'
37+
default['cfncluster']['cfncluster-node-version'] = '2.4.1'
2538
# URLs to software packages used during install recipes
2639
# Gridengine software
2740
default['cfncluster']['sge']['version'] = '8.1.9'
2841
default['cfncluster']['sge']['url'] = 'https://arc.liv.ac.uk/downloads/SGE/releases/8.1.9/sge-8.1.9.tar.gz'
2942
# Torque software
30-
default['cfncluster']['torque']['version'] = '6.0.2'
31-
default['cfncluster']['torque']['url'] = 'https://github.com/adaptivecomputing/torque/archive/6.0.2.tar.gz'
43+
default['cfncluster']['torque']['version'] = '6.1.2'
44+
default['cfncluster']['torque']['url'] = 'https://github.com/adaptivecomputing/torque/archive/6.1.2.tar.gz'
3245
# Slurm software
3346
default['cfncluster']['slurm']['version'] = '18-08-6-2'
3447
default['cfncluster']['slurm']['url'] = 'https://github.com/SchedMD/slurm/archive/slurm-18-08-6-2.tar.gz'
@@ -62,20 +75,27 @@
6275
default['openssh']['server']['subsystem'] = 'sftp /usr/libexec/openssh/sftp-server'
6376
default['openssh']['client']['gssapi_authentication'] = 'yes'
6477

78+
# ulimit settings
79+
default['cfncluster']['filehandle_limit'] = 10000
80+
default['cfncluster']['memory_limit'] = 'unlimited'
81+
6582
# Platform defaults
6683
case node['platform_family']
6784
when 'rhel', 'amazon'
6885

6986
default['cfncluster']['kernel_devel_pkg']['name'] = "kernel-devel"
7087
default['cfncluster']['kernel_devel_pkg']['version'] = node['kernel']['release'].chomp('.x86_64')
7188

89+
# Modulefile Directory
90+
default['cfncluster']['modulefile_dir'] = "/usr/share/Modules/modulefiles"
91+
7292
case node['platform']
7393
when 'centos', 'redhat', 'scientific' # ~FC024
7494
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel
7595
libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool
7696
httpd boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel
7797
blas-devel fftw-devel libffi-devel openssl-devel dkms mysql-devel libedit-devel
78-
libical-devel postgresql-devel postgresql-server sendmail mdadm]
98+
libical-devel postgresql-devel postgresql-server sendmail mdadm python python-pip]
7999

80100
# Lustre Drivers for Centos 6
81101
default['cfncluster']['lustre']['version'] = '2.10.6'
@@ -87,7 +107,7 @@
87107
libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool
88108
httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel
89109
blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel
90-
libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm]
110+
libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm python python-pip]
91111
if node['platform_version'].split('.')[1] == '6'
92112
# Lustre Drivers for Centos 7.6
93113
default['cfncluster']['lustre']['version'] = '2.10.6'
@@ -108,7 +128,7 @@
108128
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel
109129
libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool
110130
httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel
111-
libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server
131+
libffi-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server
112132
sendmail cmake byacc libglvnd-devel mdadm]
113133
end
114134

@@ -125,11 +145,13 @@
125145
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh libssl-dev ncurses-dev libpam-dev net-tools libhwloc-dev dkms
126146
tcl-dev automake autoconf python-parted libtool librrd-dev libapr1-dev libconfuse-dev
127147
apache2 libboost-dev libdb-dev tcsh libssl-dev libncurses5-dev libpam0g-dev libxt-dev
128-
libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev
148+
libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev python python-pip
129149
r-base libatlas-dev libblas-dev libfftw3-dev libffi-dev libssl-dev libxml2-dev mdadm]
130150
if node['platform_version'] == '14.04'
131151
default['cfncluster']['base_packages'].push('libopenmpi-dev')
132152
end
153+
# Modulefile Directory
154+
default['cfncluster']['modulefile_dir'] = "/usr/share/modules/modulefiles"
133155
default['cfncluster']['kernel_generic_pkg'] = "linux-generic"
134156
default['cfncluster']['kernel_extra_pkg'] = "linux-image-extra-#{node['kernel']['release']}"
135157
default['cfncluster']['ganglia']['apache_user'] = 'www-data'

files/default/attachVolume.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
#!/usr/bin/env python
2-
31
import sys
4-
import parted
2+
import subprocess
53
import os
6-
import urllib2
4+
import requests
75
import boto3
86
import time
9-
import ConfigParser
7+
import configparser
108
from botocore.config import Config
119

1210

@@ -23,24 +21,36 @@ def convert_dev(dev):
2321
else:
2422
return dev
2523

24+
def get_all_devices():
25+
# lsblk -d -n
26+
# xvda 202:0 0 17G 0 disk
27+
# xvdb 202:16 0 20G 0 disk /shared
28+
command = ["/bin/lsblk", "-d", "-n"]
29+
30+
try:
31+
output = subprocess.check_output(command, stderr=subprocess.STDOUT, universal_newlines=True).split("\n")
32+
return ["/dev/{}".format(line.split()[0]) for line in output if len(line.split()) > 0]
33+
except subprocess.CalledProcessError as e:
34+
print("Failed to get devices with lsblk -d -n")
35+
raise e
2636

2737
def main():
2838
# Get EBS volume Id
2939
try:
3040
volumeId = str(sys.argv[1])
3141
except IndexError:
32-
print "Provide an EBS volume ID to attach i.e. vol-cc789ea5"
42+
print("Provide an EBS volume ID to attach i.e. vol-cc789ea5")
3343
sys.exit(1)
3444

3545
# Get instance ID
36-
instanceId = urllib2.urlopen("http://169.254.169.254/latest/meta-data/instance-id").read()
46+
instanceId = requests.get("http://169.254.169.254/latest/meta-data/instance-id").text
3747

3848
# Get region
39-
region = urllib2.urlopen("http://169.254.169.254/latest/meta-data/placement/availability-zone").read()
49+
region = requests.get("http://169.254.169.254/latest/meta-data/placement/availability-zone").text
4050
region = region[:-1]
4151

4252
# Generate a list of system paths minus the root path
43-
paths = [convert_dev(device.path) for device in parted.getAllDevices()]
53+
paths = [convert_dev(device) for device in get_all_devices()]
4454

4555
# List of possible block devices
4656
blockDevices = ['/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde', '/dev/sdf', '/dev/sdg', '/dev/sdh',
@@ -52,7 +62,7 @@ def main():
5262
availableDevices = [a for a in blockDevices if a not in paths]
5363

5464
# Parse configuration file to read proxy settings
55-
config = ConfigParser.RawConfigParser()
65+
config = configparser.RawConfigParser()
5666
config.read('/etc/boto.cfg')
5767
proxy_config = Config()
5868
if config.has_option('Boto', 'proxy') and config.has_option('Boto', 'proxy_port'):
@@ -72,12 +82,12 @@ def main():
7282
x = 0
7383
while state != "attached":
7484
if x == 36:
75-
print "Volume %s failed to mount in 180 seconds." % volumeId
85+
print("Volume %s failed to mount in 180 seconds." % volumeId)
7686
exit(1)
7787
if state in ["busy" or "detached"]:
78-
print "Volume %s in bad state %s" % (volumeId, state)
88+
print("Volume %s in bad state %s" % (volumeId, state))
7989
exit(1)
80-
print "Volume %s in state %s ... waiting to be 'attached'" % (volumeId, state)
90+
print("Volume %s in state %s ... waiting to be 'attached'" % (volumeId, state))
8191
time.sleep(5)
8292
x += 1
8393
try:

files/default/compute_ready

Lines changed: 0 additions & 11 deletions
This file was deleted.

files/default/ec2-volid.rules

Lines changed: 0 additions & 4 deletions
This file was deleted.

files/default/ec2_dev_2_volid.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
#!/usr/bin/env python
2-
3-
import urllib2
1+
import requests
42
import sys
53
import os
64
import syslog
75
import time
86
import boto3
9-
import ConfigParser
7+
import configparser
108
from botocore.config import Config
119

1210

@@ -31,14 +29,14 @@ def main():
3129
dev = '/dev/' + dev
3230

3331
# Get instance ID
34-
instanceId = urllib2.urlopen("http://169.254.169.254/latest/meta-data/instance-id").read()
32+
instanceId = requests.get("http://169.254.169.254/latest/meta-data/instance-id").text
3533

3634
# Get region
37-
region = urllib2.urlopen("http://169.254.169.254/latest/meta-data/placement/availability-zone").read()
35+
region = requests.get("http://169.254.169.254/latest/meta-data/placement/availability-zone").text
3836
region = region[:-1]
3937

4038
# Parse configuration file to read proxy settings
41-
config = ConfigParser.RawConfigParser()
39+
config = configparser.RawConfigParser()
4240
config.read('/etc/boto.cfg')
4341
proxy_config = Config()
4442
if config.has_option('Boto', 'proxy') and config.has_option('Boto', 'proxy_port'):
@@ -53,7 +51,7 @@ def main():
5351
devices = ec2.describe_instance_attribute(InstanceId=instanceId, Attribute='blockDeviceMapping').get('BlockDeviceMappings')
5452
devmap = dict((d.get('DeviceName'), d) for d in devices)
5553
x = 0
56-
while not devmap.has_key(dev):
54+
while dev not in devmap:
5755
if x == 36:
5856
syslog.syslog("Dev %s did not appears in 180 seconds." % dev)
5957
sys.exit(1)

files/default/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
awscli
2+
boto3
3+
supervisor
4+
requests

files/default/torque.csh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
set path = (/opt/torque/bin /opt/torque/sbin $path)

0 commit comments

Comments
 (0)