
Commit a3432a1

fix: added cli functionality to dataproc quickstart example (GoogleCloudPlatform#2734)
* Added CLI functionality to quickstart
1 parent 1425a54 · commit a3432a1

File tree: 2 files changed, +58 −22 lines


dataproc/quickstart/quickstart.py

+44 −13

@@ -15,25 +15,25 @@
 # limitations under the License.

 # [START dataproc_quickstart]
+"""
+This quickstart sample walks a user through creating a Cloud Dataproc
+cluster, submitting a PySpark job from Google Cloud Storage to the
+cluster, reading the output of the job and deleting the cluster, all
+using the Python client library.
+
+Usage:
+    python quickstart.py --project_id <PROJECT_ID> --region <REGION> \
+        --cluster_name <CLUSTER_NAME> --job_file_path <GCS_JOB_FILE_PATH>
+"""
+
+import argparse
 import time

 from google.cloud import dataproc_v1 as dataproc
 from google.cloud import storage


 def quickstart(project_id, region, cluster_name, job_file_path):
-    """This quickstart sample walks a user through creating a Cloud Dataproc
-    cluster, submitting a PySpark job from Google Cloud Storage to the
-    cluster, reading the output of the job and deleting the cluster, all
-    using the Python client library.
-
-    Args:
-        project_id (string): Project to use for creating resources.
-        region (string): Region where the resources should live.
-        cluster_name (string): Name to use for creating a cluster.
-        job_file_path (string): Job in GCS to execute against the cluster.
-    """
-
     # Create the cluster client.
     cluster_client = dataproc.ClusterControllerClient(client_options={
         'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
@@ -125,4 +125,35 @@ def quickstart(project_id, region, cluster_name, job_file_path):
     operation.result()

     print('Cluster {} successfully deleted.'.format(cluster_name))
-# [END dataproc_quickstart]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        '--project_id',
+        type=str,
+        required=True,
+        help='Project to use for creating resources.')
+    parser.add_argument(
+        '--region',
+        type=str,
+        required=True,
+        help='Region where the resources should live.')
+    parser.add_argument(
+        '--cluster_name',
+        type=str,
+        required=True,
+        help='Name to use for creating a cluster.')
+    parser.add_argument(
+        '--job_file_path',
+        type=str,
+        required=True,
+        help='Job in GCS to execute against the cluster.')
+
+    args = parser.parse_args()
+    quickstart(args.project_id, args.region,
+               args.cluster_name, args.job_file_path)
+# [END dataproc_quickstart]
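With the argparse entry point in place, the sample now runs straight from a shell. A minimal invocation sketch with illustrative placeholder values (the region matches the test's us-central1; the project, cluster name, and bucket below are hypothetical, not from the commit):

    python quickstart.py --project_id my-project --region us-central1 \
        --cluster_name my-test-cluster --job_file_path gs://my-bucket/sum.py

Because the parser passes description=__doc__ together with argparse.RawDescriptionHelpFormatter, running python quickstart.py --help prints the module docstring, Usage block included, without argparse re-wrapping its line breaks.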

dataproc/quickstart/quickstart_test.py

+14 −9

@@ -15,12 +15,11 @@
 import os
 import uuid
 import pytest
+import subprocess

 from google.cloud import dataproc_v1 as dataproc
 from google.cloud import storage

-import quickstart
-

 PROJECT_ID = os.environ['GCLOUD_PROJECT']
 REGION = 'us-central1'
@@ -29,10 +28,10 @@
 JOB_FILE_NAME = 'sum.py'
 JOB_FILE_PATH = 'gs://{}/{}'.format(STAGING_BUCKET, JOB_FILE_NAME)
 SORT_CODE = (
-        "import pyspark\n"
-        "sc = pyspark.SparkContext()\n"
-        "rdd = sc.parallelize((1,2,3,4,5))\n"
-        "sum = rdd.reduce(lambda x, y: x + y)\n"
+    "import pyspark\n"
+    "sc = pyspark.SparkContext()\n"
+    "rdd = sc.parallelize((1,2,3,4,5))\n"
+    "sum = rdd.reduce(lambda x, y: x + y)\n"
 )


@@ -60,10 +59,16 @@ def setup_teardown():
         blob.delete()


-def test_quickstart(capsys):
-    quickstart.quickstart(PROJECT_ID, REGION, CLUSTER_NAME, JOB_FILE_PATH)
+def test_quickstart():
+    command = [
+        'python', 'quickstart/quickstart.py',
+        '--project_id', PROJECT_ID,
+        '--region', REGION,
+        '--cluster_name', CLUSTER_NAME,
+        '--job_file_path', JOB_FILE_PATH
+    ]
+    out = subprocess.check_output(command).decode("utf-8")

-    out, _ = capsys.readouterr()
     assert 'Cluster created successfully' in out
     assert 'Submitted job' in out
     assert 'finished with state DONE:' in out