
Commit 9391b81

Browse files
authored
Merge pull request #10 from IBMStreams/develop
pypi.streamsx.hdfs: Merge develop to master branch
2 parents 03206f9 + 23f0c1c commit 9391b81

File tree

9 files changed

+2415
-93
lines changed


README.md

+49-1
@@ -84,6 +84,32 @@ or
 ant test
 
 
+### Composite Test with local Streams instance
+
+This test requires:
+- The environment variable `STREAMS_INSTALL` set, with the Streams instance running.
+- The environment variable `STREAMS_HDFS_TOOLKIT` set to the `com.ibm.streamsx.hdfs` toolkit location.
+- A running IBM Hadoop cluster with a running HDFS instance.
+- The environment variable `HDFS_SITE_XML` set to the HDFS configuration file `core-site.xml`.
+
+This test copies the HDFS configuration file `core-site.xml` into the application directory `etc` and performs the following tests:
+
+- The standard operator `Beacon` generates 1000 lines.
+- The `HdfsFileSink` operator writes every 100 lines produced by the Beacon operator into a new file in `pytest` (sample41.txt, sample42.txt, ...).
+- The `HdfsDirectoryScan` operator scans the directory `pytest` and delivers the HDFS file names on its output port.
+- The `HdfsFileSource` operator reads the HDFS files in the directory `pytest` delivered by HdfsDirectoryScan and returns the lines of the files on its output port.
+- A second `HdfsDirectoryScan` operator scans the directory `pytest` and delivers the HDFS file names on its output port.
+- The `HdfsFileCopy` operator copies the HDFS files from the directory `pytest` delivered by HdfsDirectoryScan into the local directory `/tmp/`.
+
+
+```
+cd package
+python3 -u -m unittest streamsx.hdfs.tests.test_hdfs.TestCompositeDistributed
+```
+
+
+
+
 ### Test with Streaming Analytics Service
 
 This requires Streaming Analytics service and IBM Analytics Engine service in IBM cloud.
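The batching the bullets above describe can be illustrated in plain Python, independent of the Streams operators. This is only a sketch of the expected file layout; the name pattern `sample4N.txt` is inferred from the examples in the README and may differ in the actual test.

```python
# Plain-Python sketch (not the Streams operators) of the file layout the
# composite test checks: 1000 Beacon lines, split by HdfsFileSink into
# batches of 100, each batch landing in a new file under `pytest`.
BATCH = 100
lines = [f"line {i}" for i in range(1000)]

files = {}
for batch_no, start in enumerate(range(0, len(lines), BATCH), start=1):
    # Hypothetical name pattern inferred from "sample41.txt, sample42.txt, ..."
    name = f"pytest/sample4{batch_no}.txt"
    files[name] = lines[start:start + BATCH]

print(len(files))  # -> 10
print(min(files))  # -> pytest/sample41.txt
```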
@@ -100,6 +126,29 @@ or
 ant test-sas
 
 
+### Composite Test with IBM Analytics Engine
+
+This test requires:
+- The environment variable `STREAMS_INSTALL` set, with the Streams instance running.
+- The environment variable `STREAMS_HDFS_TOOLKIT` set to the `com.ibm.streamsx.hdfs` toolkit location.
+- A running IBM Analytics Engine Hadoop cluster with a running HDFS instance.
+- The environment variable `ANALYTICS_ENGINE` set to a credentials file that contains the Hadoop cluster webhdfs credentials as a JSON string.
+
+This test reads the IAE credentials (HdfsUri, HdfsUser, HdfsPassword) from the JSON file and performs the following tests:
+
+- The standard operator `Beacon` generates 1000 lines.
+- The `HdfsFileSink` operator writes every 100 lines produced by the Beacon operator into a new file in the `pytest` directory
+  (`sample41.txt, sample42.txt, ...`).
+- The `HdfsDirectoryScan` operator scans the directory `pytest` and delivers the HDFS file names on its output port.
+- The `HdfsFileSource` operator reads the HDFS files in the directory `pytest` delivered by HdfsDirectoryScan and returns the lines of the files on its output port.
+
+
+```
+cd package
+python3 -u -m unittest streamsx.hdfs.tests.test_hdfs.TestCompositeWebHdfs
+```
+
+
 #### Remote build
 
 For using the toolkit from the build service (**force_remote_build**) run the test with:
@@ -115,4 +164,3 @@ cd package
 python3 -u -m unittest streamsx.hdfs.tests.test_hdfs.TestCloudRemote.test_close_on_tuples streamsx.hdfs.tests.test_hdfs.TestCloudRemote.test_hdfs_uri
 ```
 
-

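The IAE test above hinges on loading the webhdfs credentials JSON that `ANALYTICS_ENGINE` points at. A minimal sketch of that step; the helper name `load_iae_credentials` is hypothetical, but the three keys are the ones the README names:

```python
import json
import os

def load_iae_credentials(path):
    """Load the IAE webhdfs credentials JSON and verify the expected keys.

    Hypothetical helper for illustration; the keys HdfsUri, HdfsUser and
    HdfsPassword are the ones the README says the test reads.
    """
    with open(path) as f:
        creds = json.load(f)
    missing = {"HdfsUri", "HdfsUser", "HdfsPassword"} - creds.keys()
    if missing:
        raise KeyError(f"credentials file is missing: {sorted(missing)}")
    return creds

# In the test setup, ANALYTICS_ENGINE points at the credentials file:
# creds = load_iae_credentials(os.environ["ANALYTICS_ENGINE"])
```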
package/DESC.txt

+1-36
@@ -1,7 +1,7 @@
 Overview
 ========
 
-Provides functions to access files on HDFS. For example, connect to IBM Analytics Engine on IBM Cloud.
+Provides functions and classes to access files on HDFS. For example, connect to IBM Analytics Engine on IBM Cloud.
 
 This package exposes the `com.ibm.streamsx.hdfs <https://ibmstreams.github.io/streamsx.hdfs/>`_ toolkit as Python methods for use with Streaming Analytics service on
 IBM Cloud and IBM Streams including IBM Cloud Pak for Data.
@@ -10,41 +10,6 @@ IBM Cloud and IBM Streams including IBM Cloud Pak for Data.
 * `IBM Streams developer community <https://developer.ibm.com/streamsdev/>`_
 * `IBM Analytics Engine <https://www.ibm.com/cloud/analytics-engine>`_
 
-
-Sample
-======
-
-A simple hello world example of a Streams application writing string messages to
-a file to HDFS. Scan for created file on HDFS and read the content::
-
-    from streamsx.topology.topology import *
-    from streamsx.topology.schema import CommonSchema, StreamSchema
-    from streamsx.topology.context import submit
-    import streamsx.hdfs as hdfs
-
-    credentials = json.load(credentials_analytics_engine_service)
-
-    topo = Topology('HDFSHelloWorld')
-
-    to_hdfs = topo.source(['Hello', 'World!'])
-    to_hdfs = to_hdfs.as_string()
-
-    # Write a stream to HDFS
-    hdfs.write(to_hdfs, credentials=credentials, file='/sample/hw.txt')
-
-    scanned = hdfs.scan(topo, credentials=credentials, directory='/sample', init_delay=10)
-
-    # read text file line by line
-    r = hdfs.read(scanned, credentials=credentials)
-
-    # print each line (tuple)
-    r.print()
-
-    submit('STREAMING_ANALYTICS_SERVICE', topo)
-    # Use for IBM Streams including IBM Cloud Pak for Data
-    # submit ('DISTRIBUTED', topo)
-
 Documentation
 =============
 

package/docs/source/conf.py

+2-2
@@ -65,9 +65,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.4'
+version = '1.5'
 # The full version, including alpha/beta/rc tags.
-release = '1.4.0'
+release = '1.5.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
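The hunk above bumps `version` and `release` as a pair. A common Sphinx `conf.py` convention, shown here as a sketch, derives the short X.Y `version` from `release` so only one string needs editing on the next bump:

```python
# Derive the short X.Y version from the full release string, so a future
# bump (e.g. to 1.6.0) only touches `release`.
release = "1.5.0"
version = ".".join(release.split(".")[:2])
print(version)  # -> 1.5
```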

package/docs/source/index.rst

+2-2
@@ -1,8 +1,8 @@
 streamsx.hdfs package
 #####################
 
-IBM Streams HDFS integration
-============================
+HDFS integration for IBM Streams
+================================
 
 For details of implementing applications in Python
 for IBM Streams including IBM Cloud Pak for Data and the Streaming Analytics service

package/setup.py

+1-1
@@ -5,7 +5,7 @@
     packages = ['streamsx.hdfs'],
     include_package_data=True,
     version = streamsx.hdfs.__version__,
-    description = 'IBM Streams HDFS integration',
+    description = 'HDFS integration for IBM Streams',
     long_description = open('DESC.txt').read(),
     author = 'IBM Streams @ github.com',
     author_email = '[email protected]',
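`setup.py` single-sources its version from `streamsx.hdfs.__version__` by importing the package. When importing at build time is undesirable, a regex over `__init__.py` is a common alternative; the snippet below sketches that pattern with the file contents inlined (the string is a stand-in, not the real file):

```python
import re

# Stand-in for the contents of package/streamsx/hdfs/__init__.py;
# parsing the text avoids importing the package (and its dependencies)
# inside setup.py.
init_py = "__version__='1.5.0'\n"

match = re.search(r"__version__\s*=\s*'([^']+)'", init_py)
if match is None:
    raise RuntimeError("__version__ not found")
version = match.group(1)
print(version)  # -> 1.5.0
```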

package/streamsx/hdfs/__init__.py

+3-3
@@ -65,7 +65,7 @@
 
 """
 
-__version__='1.4.0'
+__version__='1.5.0'
 
-__all__ = ['download_toolkit', 'configure_connection', 'scan', 'read', 'write']
-from streamsx.hdfs._hdfs import download_toolkit, configure_connection, scan, read, write, copy
+__all__ = ['HdfsDirectoryScan', 'HdfsFileSink', 'HdfsFileSource', 'HdfsFileCopy', 'download_toolkit', 'configure_connection', 'scan', 'read', 'write']
+from streamsx.hdfs._hdfs import download_toolkit, configure_connection, scan, read, write, copy, HdfsDirectoryScan, HdfsFileSink, HdfsFileSource, HdfsFileCopy
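Note that the new `__all__` lists the four composite classes but not `copy`, even though the import line pulls `copy` in: `__all__` only governs what `from streamsx.hdfs import *` re-exports, not what the module contains. A self-contained sketch of that behavior, using a stand-in module rather than the real package:

```python
import sys
import types

# Build a stand-in module mirroring the shape of streamsx/hdfs/__init__.py:
# it defines three names but lists only two of them in __all__.
mod = types.ModuleType("fake_hdfs")
exec(
    "__all__ = ['HdfsDirectoryScan', 'scan']\n"
    "def HdfsDirectoryScan(): pass\n"
    "def scan(): pass\n"
    "def copy(): pass\n",  # defined, but deliberately left out of __all__
    mod.__dict__,
)
sys.modules["fake_hdfs"] = mod

ns = {}
exec("from fake_hdfs import *", ns)
print(sorted(k for k in ns if not k.startswith("__")))
# -> ['HdfsDirectoryScan', 'scan']   ('copy' is not star-exported)
```

`copy` remains reachable as `fake_hdfs.copy`; it is simply excluded from the star-import namespace.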
