-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- introduces pandas module - abstracted Reader class to a common BaseReader (as part of models module) - re-implemented spark.readers.Reader based on BaseReader - introduces ExcelReader at koheesio.pandas.readers.excel.ExcelReader - introduces ExcelReader at koheesio.spark.reader.excel.ExcelReader - added unittests to cover the above - added excel extra dependency - added docs --------- Co-authored-by: Danny Meijer <[email protected]>
- Loading branch information
1 parent
d7d3243
commit fc11f0e
Showing
22 changed files
with
416 additions
and
85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
""" | ||
Module for the BaseReader class | ||
""" | ||
|
||
from typing import Optional, TypeVar | ||
from abc import ABC, abstractmethod | ||
|
||
from koheesio import Step | ||
|
||
# Define a type variable that can be any type of DataFrame | ||
DataFrameType = TypeVar("DataFrameType") | ||
|
||
|
||
class BaseReader(Step, ABC):
    """Base class for all Readers

    A Reader is a Step that reads data from a source based on the input parameters
    and stores the result in self.output.df (DataFrame).

    When implementing a Reader, the execute() method should be implemented.
    The execute() method should read from the source and store the result in self.output.df.

    The Reader class implements a standard read() method that calls the execute() method and returns the result. This
    method can be used to read data from a Reader without having to call the execute() method directly. Read method
    does not need to be implemented in the child class.

    The Reader class also implements a shorthand for accessing the output Dataframe through the df-property. If the
    output.df is None, .execute() will be run first.
    """

    @property
    def df(self) -> Optional[DataFrameType]:
        """Shorthand for accessing self.output.df

        If the output.df is None, .execute() will be run first.

        Returns
        -------
        Optional[DataFrameType]
            Whatever execute() stored in self.output.df
        """
        # Compare against None explicitly instead of truth-testing the DataFrame:
        # pandas raises ValueError when a DataFrame is evaluated as a boolean, which
        # would break this shorthand for every pandas-based Reader once a frame is
        # already populated.
        if self.output.df is None:
            self.execute()
        return self.output.df

    @abstractmethod
    def execute(self) -> Step.Output:
        """Execute on a Reader should handle self.output.df (output) as a minimum

        Read from whichever source -> store result in self.output.df
        """
        pass

    def read(self) -> DataFrameType:
        """Read from a Reader without having to call the execute() method directly

        Unlike the df-property, this always calls execute() before returning self.output.df.
        """
        self.execute()
        return self.output.df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
"""Base class for a Pandas step | ||
Extends the Step class with Pandas DataFrame support. The following: | ||
- Pandas steps are expected to return a Pandas DataFrame as output. | ||
""" | ||
|
||
from typing import Optional | ||
from abc import ABC | ||
|
||
from koheesio import Step, StepOutput | ||
from koheesio.models import Field | ||
from koheesio.spark.utils import import_pandas_based_on_pyspark_version | ||
|
||
pandas = import_pandas_based_on_pyspark_version() | ||
|
||
|
||
class PandasStep(Step, ABC):
    """Abstract Step flavour whose output carries a Pandas DataFrame

    Builds on the Step class by declaring an Output model with a ``df`` field:

    - Pandas steps are expected to return a Pandas DataFrame as output.
    """

    class Output(StepOutput):
        """Output model of a PandasStep; ``df`` stays None until a step assigns it"""

        df: Optional[pandas.DataFrame] = Field(
            default=None,
            description="The Pandas DataFrame",
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
""" | ||
Base class for all Readers | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
|
||
from koheesio.models.reader import BaseReader | ||
from koheesio.pandas import PandasStep | ||
|
||
|
||
class Reader(BaseReader, PandasStep, ABC):
    """Abstract base for Pandas-backed Readers

    Combines BaseReader with PandasStep, so the DataFrame a Reader produces lives in
    self.output.df of a PandasStep.Output.

    Subclasses only need to implement execute(): read from the source and assign the
    result to self.output.df.

    The read() method and the df-property are inherited from BaseReader and do not need
    to be re-implemented in the child class.
    """

    @abstractmethod
    def execute(self) -> PandasStep.Output:
        """Read from the source and store the resulting DataFrame in self.output.df

        Handling self.output.df is the minimum every concrete Reader must do.
        """
        ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
""" | ||
Excel reader for Pandas | ||
Note | ||
---- | ||
Ensure the 'excel' extra is installed before using this reader. | ||
Default implementation uses openpyxl as the engine for reading Excel files. | ||
Other implementations can be used by passing the correct keyword arguments to the reader. | ||
See Also | ||
-------- | ||
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html | ||
- koheesio.pandas.readers.excel.ExcelReader | ||
""" | ||
|
||
from typing import List, Optional, Union | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
from koheesio.models import ExtraParamsMixin, Field | ||
from koheesio.pandas.readers import Reader | ||
|
||
|
||
class ExcelReader(Reader, ExtraParamsMixin):
    """Reader that loads one sheet of an Excel file through pd.read_excel

    See Also
    --------
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

    Attributes
    ----------
    path : Union[str, Path]
        The path to the Excel file
    sheet_name : str
        The name of the sheet to read (defaults to "Sheet1")
    header : Optional[Union[int, List[int]]]
        Row(s) to use as the column names (defaults to 0)

    Any other keyword arguments will be passed to pd.read_excel.
    """

    path: Union[str, Path] = Field(description="The path to the Excel file")
    sheet_name: str = Field(default="Sheet1", description="The name of the sheet to read")
    header: Optional[Union[int, List[int]]] = Field(default=0, description="Row(s) to use as the column names")

    def execute(self):
        """Read the configured sheet and store the result in self.output.df"""
        # Extra keyword arguments are taken from self.params; fall back to an empty
        # mapping when it is falsy so the ** expansion below always works.
        read_options = self.params
        if not read_options:
            read_options = {}

        frame = pd.read_excel(
            self.path,
            sheet_name=self.sheet_name,
            header=self.header,
            **read_options,
        )
        self.output.df = frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
""" | ||
Excel reader for Spark | ||
Note | ||
---- | ||
Ensure the 'excel' extra is installed before using this reader. | ||
Default implementation uses openpyxl as the engine for reading Excel files. | ||
Other implementations can be used by passing the correct keyword arguments to the reader. | ||
See Also | ||
-------- | ||
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html | ||
- koheesio.pandas.readers.excel.ExcelReader | ||
""" | ||
|
||
from pyspark.pandas import DataFrame as PandasDataFrame | ||
|
||
from koheesio.pandas.readers.excel import ExcelReader as PandasExcelReader | ||
from koheesio.spark.readers import Reader | ||
|
||
|
||
class ExcelReader(Reader, PandasExcelReader):
    """Spark flavour of the Excel reader

    Thin wrapper around PandasExcelReader: the Excel file is first read through the
    pandas reader, and the resulting DataFrame is then handed to
    self.spark.createDataFrame to obtain the DataFrame stored in self.output.df.

    Attributes
    ----------
    path: str
        The path to the Excel file
    sheet_name: str
        The name of the sheet to read
    header: int
        The row to use as the column names
    """

    def execute(self):
        """Read via PandasExcelReader, convert with Spark, store in self.output.df"""
        # Build a pandas reader from this step and run it to get the intermediate frame.
        pandas_reader = PandasExcelReader.from_step(self)
        # NOTE(review): PandasDataFrame is the pyspark.pandas DataFrame alias, while the
        # pandas reader appears to store the result of pd.read_excel - confirm that this
        # annotation is the intended one.
        intermediate: PandasDataFrame = pandas_reader.execute().df
        self.output.df = self.spark.createDataFrame(intermediate)
Oops, something went wrong.