
Commit

Bumping version to 0.0.13
igorborgest committed Oct 26, 2019
1 parent d328e14 commit 5331776
Showing 6 changed files with 232 additions and 146 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

> Utility belt to handle data on AWS.
[![Release](https://img.shields.io/badge/release-0.0.12-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Release](https://img.shields.io/badge/release-0.0.13-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
2 changes: 1 addition & 1 deletion awswrangler/__version__.py
@@ -1,4 +1,4 @@
__title__ = "awswrangler"
__description__ = "Utility belt to handle data on AWS."
__version__ = "0.0.12"
__version__ = "0.0.13"
__license__ = "Apache License 2.0"
196 changes: 134 additions & 62 deletions awswrangler/emr.py
@@ -2,7 +2,7 @@
Module to handle all utilities related to EMR (Elastic MapReduce)
https://aws.amazon.com/emr/
"""
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Any, Union, Collection
import logging
import json

@@ -29,8 +29,8 @@ def _build_cluster_args(**pars):
"JobFlowRole": pars["emr_ec2_role"],
"ServiceRole": pars["emr_role"],
"Instances": {
"KeepJobFlowAliveWhenNoSteps": True,
"TerminationProtected": False,
"KeepJobFlowAliveWhenNoSteps": pars["keep_cluster_alive_when_no_steps"],
"TerminationProtected": pars["termination_protected"],
"Ec2SubnetId": pars["subnet_id"],
"InstanceFleets": []
}
@@ -53,47 +53,68 @@ def _build_cluster_args(**pars):
args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"]

# Configurations
if pars["python3"] or pars["spark_glue_catalog"] or pars["hive_glue_catalog"] or pars["presto_glue_catalog"]:
args["Configurations"]: List = []
if pars["python3"]:
args["Configurations"].append({
"Classification":
"spark-env",
"Properties": {},
"Configurations": [{
"Classification": "export",
"Properties": {
"PYSPARK_PYTHON": "/usr/bin/python3"
},
"Configurations": []
}]
})
if pars["spark_glue_catalog"]:
args["Configurations"].append({
"Classification": "spark-hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
},
"Configurations": []
})
if pars["hive_glue_catalog"]:
args["Configurations"].append({
"Classification": "hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
},
"Configurations": []
})
if pars["presto_glue_catalog"]:
args["Configurations"].append({
"Classification": "presto-connector-hive",
args["Configurations"]: List[Dict[str, Any]] = [{
"Classification": "spark-log4j",
"Properties": {
"log4j.rootCategory": f"{pars['spark_log_level']}, console"
}
}]
if pars["python3"]:
args["Configurations"].append({
"Classification":
"spark-env",
"Properties": {},
"Configurations": [{
"Classification": "export",
"Properties": {
"hive.metastore.glue.datacatalog.enabled": "true"
"PYSPARK_PYTHON": "/usr/bin/python3"
},
"Configurations": []
})
}]
})
if pars["spark_glue_catalog"]:
args["Configurations"].append({
"Classification": "spark-hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
},
"Configurations": []
})
if pars["hive_glue_catalog"]:
args["Configurations"].append({
"Classification": "hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
},
"Configurations": []
})
if pars["presto_glue_catalog"]:
args["Configurations"].append({
"Classification": "presto-connector-hive",
"Properties": {
"hive.metastore.glue.datacatalog.enabled": "true"
},
"Configurations": []
})
if pars["maximize_resource_allocation"]:
args["Configurations"].append({
"Classification": "spark",
"Properties": {
"maximizeResourceAllocation": "true"
}
})
if (pars["spark_jars_path"] is not None) or (pars["spark_defaults"] is not None):
spark_defaults: Dict[str, Union[str, Dict[str, str]]] = {
"Classification": "spark-defaults",
"Properties": {}
}
if pars["spark_jars_path"] is not None:
spark_defaults["Properties"]["spark.jars"] = pars["spark_jars_path"]
for k, v in pars["spark_defaults"].items():
spark_defaults["Properties"][k] = v
args["Configurations"].append(spark_defaults)

# Applications
if pars["applications"]:
@@ -108,16 +129,20 @@ def _build_cluster_args(**pars):
}
} for x in pars["bootstraps_paths"]]

# Debugging
if pars["debugging"]:
args["Steps"]: List[Dict] = [{
"Name": "Setup Hadoop Debugging",
"ActionOnFailure": "TERMINATE_CLUSTER",
"HadoopJarStep": {
"Jar": "command-runner.jar",
"Args": ["state-pusher-script"]
}
}]
# Debugging and Steps
if (pars["debugging"] is True) or (pars["steps"] is not None):
args["Steps"]: List[Dict[str, Collection[str]]] = []
if pars["debugging"] is True:
args["Steps"].append({
"Name": "Setup Hadoop Debugging",
"ActionOnFailure": "TERMINATE_CLUSTER",
"HadoopJarStep": {
"Jar": "command-runner.jar",
"Args": ["state-pusher-script"]
}
})
if pars["steps"] is not None:
args["Steps"] += pars["steps"]

# Master Instance Fleet
timeout_action_master: str = "SWITCH_TO_ON_DEMAND" if pars[
@@ -161,7 +186,8 @@

# Core Instance Fleet
if (pars["instance_num_spot_core"] > 0) or pars["instance_num_on_demand_core"] > 0:
timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars["spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars[
"spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER"
fleet_core: Dict = {
"Name":
"CORE",
@@ -284,7 +310,14 @@ def create_cluster(self,
security_groups_master_additional: Optional[List[str]] = None,
security_group_slave: Optional[str] = None,
security_groups_slave_additional: Optional[List[str]] = None,
security_group_service_access: Optional[str] = None):
security_group_service_access: Optional[str] = None,
spark_log_level: str = "WARN",
spark_jars_path: Optional[str] = None,
spark_defaults: Optional[Dict[str, str]] = None,
maximize_resource_allocation: bool = False,
steps: Optional[List[Dict[str, Collection[str]]]] = None,
keep_cluster_alive_when_no_steps: bool = True,
termination_protected: bool = False):
"""
Create an EMR cluster with an instance-fleet configuration
https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-instance-fleet.html
@@ -329,6 +362,13 @@ def create_cluster(self,
:param security_group_slave: The identifier of the Amazon EC2 security group for the core and task nodes.
:param security_groups_slave_additional: A list of additional Amazon EC2 security group IDs for the core and task nodes.
:param security_group_service_access: The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets.
:param spark_log_level: log4j.rootCategory log level (ALL, DEBUG, INFO, WARN, ERROR, FATAL, OFF, TRACE)
:param spark_jars_path: spark.jars (https://spark.apache.org/docs/latest/configuration.html) (e.g. s3://...)
:param spark_defaults: (https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#spark-defaults)
:param maximize_resource_allocation: Configure your executors to utilize the maximum resources possible (https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html#emr-spark-maximizeresourceallocation)
:param steps: Step definitions (Note: use EMR.build_step() to build them)
:param keep_cluster_alive_when_no_steps: Specifies whether the cluster should remain available after completing all steps
:param termination_protected: Specifies whether the Amazon EC2 instances in the cluster are protected from termination by API calls, user intervention, or in the event of a job-flow error.
:return: Cluster ID (string)
"""
args = EMR._build_cluster_args(**locals())
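        # Note: the new keyword arguments above (spark_log_level, spark_jars_path,
        # spark_defaults, maximize_resource_allocation, steps, keep_cluster_alive_when_no_steps,
        # termination_protected) are consumed by _build_cluster_args() via **locals().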
@@ -358,28 +398,60 @@ def terminate_cluster(self, cluster_id: str) -> None:
])
logger.info(f"response: \n{json.dumps(response, default=str, indent=4)}")

def submit_step(self, cluster_id: str, name: str, cmd: str, action_on_failure: str = "CONTINUE") -> str:
def submit_steps(self, cluster_id: str, steps: List[Dict[str, Collection[str]]]) -> List[str]:
"""
Submit a list of steps
:param cluster_id: EMR Cluster ID
:param steps: Step definitions (Note: use EMR.build_step() to build them)
:return: List of step IDs
"""
response: Dict = self._client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=steps)
logger.info(f"response: \n{json.dumps(response, default=str, indent=4)}")
return response["StepIds"]

def submit_step(self,
cluster_id: str,
name: str,
command: str,
action_on_failure: str = "CONTINUE",
script: bool = False) -> str:
"""
Submit a new step to the EMR cluster
:param cluster_id: EMR Cluster ID
:param name: Step name
:param cmd: Command to be executed
:param command: Command to run, e.g. 'echo "Hello!"'; for a script, e.g. 's3://.../script.sh arg1 arg2'
:param action_on_failure: 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
:param script: False to run the command with command-runner.jar, True to run it with the script runner (https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html)
:return: Step ID
"""
region: str = self._session.region_name
logger.info(f"region: {region}")
step = EMR.build_step(self, name=name, command=command, action_on_failure=action_on_failure, script=script)
response: Dict = self._client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
logger.info(f"response: \n{json.dumps(response, default=str, indent=4)}")
return response["StepIds"][0]

def build_step(self, name: str, command: str, action_on_failure: str = "CONTINUE",
script: bool = False) -> Dict[str, Collection[str]]:
"""
Build the Step dictionary
:param name: Step name
:param command: Command to run, e.g. 'echo "Hello!"'; for a script, e.g. 's3://.../script.sh arg1 arg2'
:param action_on_failure: 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
:param script: False to run the command with command-runner.jar, True to run it with the script runner (https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-commandrunner.html)
:return: Step Dict
"""
jar: str = "command-runner.jar"
if script is True:
region: str = self._session.region_name
jar = f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar"
step = {
"Name": name,
"ActionOnFailure": action_on_failure,
"HadoopJarStep": {
"Jar": f"s3://{region}.elasticmapreduce/libs/script-runner/script-runner.jar",
"Args": cmd.split(" ")
"Jar": jar,
"Args": command.split(" ")
}
}
response: Dict = self._client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
logger.info(f"response: \n{json.dumps(response, default=str, indent=4)}")
return response["StepIds"][0]
return step
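    # Illustration (hypothetical call): build_step(name="demo", command='echo "Hello!"')
    # returns:
    #   {"Name": "demo", "ActionOnFailure": "CONTINUE",
    #    "HadoopJarStep": {"Jar": "command-runner.jar", "Args": ['echo', '"Hello!"']}}
    # With script=True, the Jar becomes the regional script-runner.jar instead.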

def get_step_state(self, cluster_id: str, step_id: str) -> str:
"""
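For context, a minimal usage sketch of the step API introduced above. It assumes the awswrangler Session exposes these EMR utilities as session.emr (not shown in this diff); the cluster ID and S3 path are placeholders.

    import awswrangler

    session = awswrangler.Session()
    cluster_id = "j-XXXXXXXXXXXXX"  # placeholder: an existing EMR cluster

    # Build step definitions locally, without submitting anything yet.
    step_cmd = session.emr.build_step(name="say-hello", command='echo "Hello!"')
    step_script = session.emr.build_step(
        name="run-script",
        command="s3://my-bucket/scripts/job.sh arg1 arg2",  # placeholder path
        script=True)  # routes the command through script-runner.jar

    # Submit both steps in a single call, then poll the first one.
    step_ids = session.emr.submit_steps(cluster_id=cluster_id, steps=[step_cmd, step_script])
    state = session.emr.get_step_state(cluster_id=cluster_id, step_id=step_ids[0])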
4 changes: 2 additions & 2 deletions testing/run-tests.sh
@@ -1,8 +1,8 @@
#!/bin/#!/usr/bin/env bash
#!/usr/bin/env bash
set -e

cd ..
pip install -e .
pip install --upgrade -e .
yapf --in-place --recursive setup.py awswrangler testing/test_awswrangler
mypy awswrangler
flake8 setup.py awswrangler testing/test_awswrangler