.printSchema() -> None
# Prints out the schema in the tree format.
.repartition(
numPartitions: int,
*cols: str
) -> DataFrame
# Returns a new DataFrame partitioned by the given partitioning expressions.
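# Example (illustrative; assumes a DataFrame df with a 'country' column):
df.repartition(4, 'country')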
SparkContext(
master: str = None,
appName: str = None,
sparkHome: str = None,
pyFiles: str = None,
environment: Dict[str, str] = None,
batchSize: int = 0,
serializer: str = PickleSerializer(),
conf: SparkConf = None,
gateway: Gateway = None,
jsc: JavaObject = None,
profiler_cls: str = 'BasicProfiler'
)
# Main entry point for Spark functionality; a SparkContext represents the connection to a Spark cluster.
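# Example (illustrative values):
sc = SparkContext(master='local[*]', appName='example-app')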
.join(
other: 'DataFrame',
on: Union[str, List[str], Column, List[Column]],
how: str = 'inner' | 'outer' | 'left_outer' | 'right_outer' | 'leftsemi'
) -> 'DataFrame'
# Joins with another DataFrame, using the given join expression.
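# Example (illustrative; other_df and the 'id' column are hypothetical):
df.join(other_df, on='id', how='left_outer')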
.dropna(
how: str = 'any' | 'all',
thresh: int = None,
subset: Union[str, Tuple[str, ...]] = None
) -> 'DataFrame'
# Returns a new DataFrame omitting rows with null values.
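# Example (illustrative; assumes 'name' and 'age' columns):
df.dropna(how='any', subset=['name', 'age'])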
from pyspark import StorageLevel
df.persist(StorageLevel.MEMORY_ONLY)
# Note that the storage level MEMORY_ONLY means that all partitions that do not fit into memory will be recomputed when they are needed.
.persist(
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK
) -> DataFrame
# persist() marks a DataFrame as eligible for caching (similar to cache()).
.cache() -> DataFrame
# Persist this DataFrame with the default storage level (MEMORY_AND_DISK).
.explode(
column: Union[str, Column]
) -> Column
# Creates a new row for each element in the given array or map column.
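# Example (illustrative; assumes df has an array column 'tags'):
from pyspark.sql.functions import explode
df.select(explode(df.tags).alias('tag'))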
.split(
str: Union[str, Column],
pattern: str,
limit: int = -1
) -> Column
# Splits str around pattern (pattern is a regular expression).
# Example:
from pyspark.sql.functions import split
df.select(split(df['name'], ' ').alias('s')).collect()
.coalesce(
numPartitions: int
) -> DataFrame
# Returns a new DataFrame that has exactly numPartitions partitions.
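# Example (illustrative; reduce to a single partition before writing):
df.coalesce(1).write.parquet('path/to/file.parquet')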
.select(
*cols: str
) -> DataFrame
# Projects a set of expressions and returns a new DataFrame.
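# Example (illustrative; assumes 'name' and 'age' columns):
df.select(df.name, (df.age + 1).alias('age_plus_one'))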
.alias(
alias: str
) -> Union[DataFrame, Column]
# Returns a new DataFrame (or Column) with an alias set; the example below aliases a Column.
# Example:
df.select(df.age.alias("age2")).collect()
spark.read.parquet('path/to/file.parquet')
# Loads Parquet files, returning the result as a DataFrame.
.col(
colName: str
) -> Column
# Returns a Column based on the given column name.
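# Example (illustrative; assumes an 'age' column):
from pyspark.sql.functions import col
df.filter(col('age') > 3)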
.take(
num: int
) -> List[Row]
# Returns the first num rows as a list of Row.
.filter(
condition: Union[str, Column]
) -> DataFrame
# Filters rows using the given condition.
# Example:
df.filter(df.age > 3).collect()
.collect(
) -> List[Row]
# Returns all the records as a list of Row.
.withColumn(
colName: str,
col: Column
) -> DataFrame
# Returns a new DataFrame by adding a column or replacing the existing column that has the same name.
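# Example (illustrative; assumes an 'age' column):
df.withColumn('age2', df.age + 2)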
.where(
condition: Union[str, Column]
) -> DataFrame
# Filters rows using the given condition.
.write.parquet('path/to/file.parquet')
# Saves the content of the DataFrame in Parquet format at the specified path.
.groupBy(
*cols: str
) -> GroupedData
# Groups the DataFrame using the specified columns, so we can run aggregation on them.
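# Example (illustrative; assumes a 'name' column):
df.groupBy('name').count()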
.sort(
*cols: str,
**kwargs
) -> DataFrame
# Returns a new DataFrame sorted by the specified column(s).
# Example:
df.sort(df.age.desc()).collect()
agg(
*exprs: Union[Column, str]
) -> DataFrame
# Aggregates on the entire DataFrame without groups (shorthand for df.groupBy().agg()).
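# Example (illustrative; assumes an 'age' column):
from pyspark.sql import functions as F
df.agg(F.min('age'), F.max('age'))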
avg(
*cols: str
) -> DataFrame
# Computes the mean value of each numeric column for each group.
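# Example (illustrative; 'department' and 'salary' are hypothetical columns):
df.groupBy('department').avg('salary')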
.withColumnRenamed(
existing: str,
new: str
) -> DataFrame
# Returns a new DataFrame by renaming an existing column.
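# Example (illustrative; assumes an 'age' column):
df.withColumnRenamed('age', 'age_in_years')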
.orderBy(
*cols: Union[Column, str],
**kwargs
) -> DataFrame
# Returns a new DataFrame sorted by the specified column(s).
.broadcast(
df: DataFrame
) -> DataFrame
# Marks a DataFrame as small enough for use in broadcast joins.
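# Example (illustrative; small_df and the 'id' column are hypothetical):
from pyspark.sql.functions import broadcast
df.join(broadcast(small_df), on='id')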
.partitionBy(
*cols: Union[str, Column]
) -> DataFrameWriter
# Partitions the output by the given columns on the file system.
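# Example (illustrative; assumes a 'year' column):
df.write.partitionBy('year').parquet('path/to/file.parquet')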
.drop(
*cols: str
) -> DataFrame
# Returns a new DataFrame with the specified column(s) dropped.
# Note that drop expects column names to be passed as plain strings when multiple columns are to be removed.
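# Example (illustrative; assumes 'age' and 'name' columns):
df.drop('age', 'name')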
.count(
) -> int
# Returns the number of rows in this DataFrame.
.distinct(
) -> DataFrame
# Returns a new DataFrame containing the distinct rows in this DataFrame.
.dropDuplicates(
subset: List[str] = None
) -> DataFrame
# Returns a new DataFrame with duplicate rows removed, optionally only considering certain columns.
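# Example (illustrative; assumes 'name' and 'age' columns):
df.dropDuplicates(['name', 'age'])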
lit(
col: Union[str, int, float, bool]
) -> Column
# Creates a Column of literal value.
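# Example (illustrative):
from pyspark.sql.functions import lit
df.withColumn('constant', lit(1))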