making pyspark an optional dependency

SuperCowPowers · May 12, 2021 · 2924f5a · 2924f5a
1 parent 6271847
commit 2924f5a
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,9 @@ with Pandas, scikit-learn, Kafka, and Spark
 
 ### Install
 ```
-$ pip install zat
+pip install zat
+pip install zat[pyspark] (includes pyspark library)
+pip install zat[all] (include pyarrow, yara-python, and tldextract)
 ```
 
 ### Getting Started
@@ -59,6 +61,11 @@ from here to there.
 ### Documentation
 <https://supercowpowers.github.io/zat/>
 
+#### Running the Tests
+```
+pip install pytest coverage pytest-cov
+pytest zat
+```
 
 ### About SuperCowPowers
 The company was formed so that its developers could follow their passion for Python, streaming data pipelines and having fun with data analysis. We also think cows are cool and should be superheros or at least carry around rayguns and burner phones. <a href="https://www.supercowpowers.com" target="_blank">Visit SuperCowPowers</a>
diff --git a/examples/kafka_spark.py b/examples/kafka_spark.py
@@ -1,18 +1,17 @@
 """Read Kafka Streams into Spark, perform simple filtering/aggregation"""
 
 import sys
-import pyspark
-from pyspark.sql import SparkSession
-from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
-from pyspark.sql.functions import from_json, to_json, col, struct, udf
-
 import argparse
 from time import sleep
 
-# Local imports
-from zat.utils import signal_utils
+try:
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import StructType, StringType, BooleanType, IntegerType
+    from pyspark.sql.functions import from_json, to_json, col, struct, udf
+except ImportError:
+    print('\npip install pyspark')
+    sys.exit(1)
 
-# Third Party Imports
 try:
     import tldextract
 except ImportError:

diff --git a/examples/zeek_to_parquet_with_spark.py b/examples/zeek_to_parquet_with_spark.py
@@ -3,7 +3,11 @@
 import os
 import sys
 import argparse
-from pyspark.sql import SparkSession
+try:
+    from pyspark.sql import SparkSession
+except ImportError:
+    print('pip install pyspark')
+    sys.exit(1)
 
 # Local imports
 from zat import log_to_sparkdf

diff --git a/setup.py b/setup.py
@@ -36,14 +36,12 @@ def get_files(dir_name):
     install_requires=[
         'requests',
         'watchdog',
-        'numpy',
-        'scipy',
         'pandas',
-        'scikit-learn',
-        'pyspark'
+        'scikit-learn'
     ],
     extras_require={
-        'all':  ['pyarrow', 'yara-python', 'tldextract']
+        'pyspark':  ['pyspark'],
+        'all':  ['pyspark', 'pyarrow', 'yara-python', 'tldextract']
     },
     license='Apache',
     keywords='Zeek, Bro, Python, Networking, Security, Scikit-Learn, Spark, Kafka, Parquet',

diff --git a/zat/__init__.py b/zat/__init__.py
@@ -1,3 +1,3 @@
 __author__ = 'Brian Wylie'
 __email__ = '[email protected]'
-__version__ = '0.4.2'
+__version__ = '0.4.3'
diff --git a/zat/log_to_sparkdf.py b/zat/log_to_sparkdf.py
@@ -1,9 +1,14 @@
 """LogToSparkDF: Converts a Zeek log to a Spark DataFrame"""
 
+import sys
 
 # Third Party
-from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
-from pyspark.sql.functions import col, when
+try:
+    from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, LongType
+    from pyspark.sql.functions import col, when
+except ImportError:
+    print('\npip install pyspark')
+
 
 # Local
 from zat import zeek_log_reader
@@ -115,8 +120,13 @@ def build_spark_schema(self, column_names, column_types, verbose=False):
 def test():
     """Test for LogToSparkDF Class"""
     import os
+    import pytest
     from zat.utils import file_utils
-    from pyspark.sql import SparkSession
+
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError:
+        pytest.skip('pip install pyspark')
 
     # Spin up a local Spark Session (with 4 executors)
     spark = SparkSession.builder.master('local[4]').appName('my_awesome').getOrCreate()