4
4
5
5
import datetime
6
6
import os
7
+ import json
7
8
from tempfile import gettempdir
8
9
import streamsx .spl .op
9
10
import streamsx .spl .types
10
11
from streamsx .topology .schema import CommonSchema , StreamSchema
11
12
from streamsx .spl .types import rstring
12
13
from urllib .parse import urlparse
14
+ from streamsx .toolkits import download_toolkit
13
15
16
+ _TOOLKIT_NAME = 'com.ibm.streamsx.hdfs'
14
17
15
18
FileInfoSchema = StreamSchema ('tuple<rstring fileName, uint64 fileSize>' )
16
19
"""Structured schema of the file write response tuple. This schema is the output schema of the write method.
17
20
18
21
``'tuple<rstring fileName, uint64 fileSize>'``
19
22
"""
20
23
24
+
25
def _add_toolkit_dependency(topo, version):
    """Declare a dependency of *topo* on the HDFS toolkit at *version*.

    This matters when the toolkit is not registered explicitly with
    ``streamsx.spl.toolkit.add_toolkit`` — e.g. when a remote build
    service selects the toolkit — so the required version is still pinned.
    """
    streamsx.spl.toolkit.add_toolkit_dependency(topo, _TOOLKIT_NAME, version)
29
+
30
+
21
31
def _read_ae_service_credentials (credentials ):
22
32
hdfs_uri = ""
23
33
user = ""
@@ -29,7 +39,12 @@ def _read_ae_service_credentials(credentials):
29
39
password = credentials .get ('cluster' ).get ('password' )
30
40
hdfs_uri = credentials .get ('cluster' ).get ('service_endpoints' ).get ('webhdfs' )
31
41
else :
32
- raise ValueError (credentials )
42
+ if 'webhdfs' in credentials :
43
+ user = credentials .get ('user' )
44
+ password = credentials .get ('password' )
45
+ hdfs_uri = credentials .get ('webhdfs' )
46
+ else :
47
+ raise ValueError (credentials )
33
48
else :
34
49
raise TypeError (credentials )
35
50
# construct expected format for hdfs_uri: webhdfs://host:port
@@ -48,6 +63,88 @@ def _check_time_param(time_value, parameter_name):
48
63
raise ValueError ("Invalid " + parameter_name + " value. Value must be at least one second." )
49
64
return result
50
65
66
def configure_connection(instance, name='hdfs', credentials=None):
    """Configures IBM Streams for a certain connection.

    Creates a new application configuration object holding the connection
    properties, or updates the existing one of the same name.

    Example for creating a configuration for a Streams instance with
    connection details::

        from streamsx.rest import Instance
        import streamsx.topology.context
        from icpd_core import icpd_util
        import streamsx.hdfs as hdfs

        cfg = icpd_util.get_service_instance_details(name='your-streams-instance')
        cfg[context.ConfigParams.SSL_VERIFY] = False
        instance = Instance.of_service(cfg)
        app_cfg = hdfs.configure_connection(instance, credentials='my_credentials_json')

    Args:
        instance(streamsx.rest_primitives.Instance): IBM Streams instance object.
        name(str): Name of the application configuration, default name is 'hdfs'.
        credentials(str|dict): The service credentials, for example Analytics Engine service credentials.
    Returns:
        Name of the application configuration.
    """
    # Credentials are mandatory; fail fast before touching the instance.
    if credentials is None:
        raise TypeError(credentials)

    # A dict is serialized to JSON; a string is assumed to be JSON already
    # and stored verbatim.
    if isinstance(credentials, dict):
        creds_value = json.dumps(credentials)
    else:
        creds_value = credentials
    properties = {'credentials': creds_value}
    description = 'HDFS credentials'

    # Update in place when a configuration of this name already exists,
    # otherwise create a fresh one.
    existing = instance.get_application_configurations(name=name)
    if existing:
        print('update application configuration: ' + name)
        existing[0].update(properties)
    else:
        print('create application configuration: ' + name)
        instance.create_application_configuration(name, properties, description)
    return name
112
+
113
+
114
def download_toolkit(url=None, target_dir=None):
    r"""Downloads the latest HDFS toolkit from GitHub.

    Example for updating the HDFS toolkit for your topology with the latest toolkit from GitHub::

        import streamsx.hdfs as hdfs
        # download HDFS toolkit from GitHub
        hdfs_toolkit_location = hdfs.download_toolkit()
        # add the toolkit to topology
        streamsx.spl.toolkit.add_toolkit(topology, hdfs_toolkit_location)

    Example for updating the topology with a specific version of the HDFS toolkit using a URL::

        import streamsx.hdfs as hdfs
        url500 = 'https://github.com/IBMStreams/streamsx.hdfs/releases/download/v5.0.0/streamx.hdfs.toolkits-5.0.0-20190902-1637.tgz'
        hdfs_toolkit_location = hdfs.download_toolkit(url=url500)
        streamsx.spl.toolkit.add_toolkit(topology, hdfs_toolkit_location)

    Args:
        url(str): Link to toolkit archive (\*.tgz) to be downloaded. Use this parameter to
            download a specific version of the toolkit.
        target_dir(str): the directory where the toolkit is unpacked to. If a relative path is given,
            the path is appended to the system temporary directory, for example to /tmp on Unix/Linux systems.
            If target_dir is ``None`` a location relative to the system temporary directory is chosen.

    Returns:
        str: the location of the downloaded HDFS toolkit

    .. note:: This function requires an outgoing Internet connection
    .. versionadded:: 1.1
    """
    # Fully-qualified call on purpose: this function shadows the
    # download_toolkit name imported from streamsx.toolkits.
    return streamsx.toolkits.download_toolkit(
        toolkit_name=_TOOLKIT_NAME, url=url, target_dir=target_dir)
147
+
51
148
52
149
def scan (topology , credentials , directory , pattern = None , init_delay = None , schema = CommonSchema .String , name = None ):
53
150
"""Scans a Hadoop Distributed File System directory for new or modified files.
0 commit comments