useful-commands.md

.printSchema(
) -> None
# Prints out the schema in the tree format.
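
# Example (illustrative):
df.printSchema()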
.repartition(
    numPartitions: int,
    *cols: str
) -> DataFrame
# Returns a new DataFrame partitioned by the given partitioning expressions.
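
# Example (illustrative; assumes df has a column 'age'):
df.repartition(10, 'age')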
SparkContext(
    master: str = None,
    appName: str = None,
    sparkHome: str = None,
    pyFiles: str = None,
    environment: Dict[str, str] = None,
    batchSize: int = 0,
    serializer: str = PickleSerializer(),
    conf: SparkConf = None,
    gateway: Gateway = None,
    jsc: JavaObject = None,
    profiler_cls: str = 'BasicProfiler'
)
# Main entry point for Spark functionality; a SparkContext represents the connection to a Spark cluster and can be used to create RDDs, accumulators and broadcast variables on that cluster.
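
# Example (illustrative; the app name is a placeholder):
from pyspark import SparkContext
sc = SparkContext(master='local[*]', appName='example-app')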
.join(
    other: 'DataFrame',
    on: Union[str, List[str], Column, List[Column]],
    how: str = 'inner' | 'outer' | 'left_outer' | 'right_outer' | 'leftsemi'
) -> 'DataFrame'
# Joins with another DataFrame, using the given join expression.
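
# Example (illustrative; assumes df and df2 share an 'id' column):
df.join(df2, on='id', how='left_outer').collect()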
.dropna(
    how: str = 'any' | 'all',
    thresh: int = None,
    subset: Union[str, Tuple[str, ...]] = None
) -> 'DataFrame'
# Returns a new DataFrame omitting rows with null values.
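
# Example (illustrative; drops rows with a null 'age' or 'name'):
df.dropna(how='any', subset=['age', 'name'])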
from pyspark import StorageLevel

df.persist(StorageLevel.MEMORY_ONLY)

# Note that the storage level MEMORY_ONLY means that all partitions that do not fit into memory will be recomputed when they are needed.

.persist(
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK
) -> DataFrame
# persist() marks a DataFrame as eligible for caching (similar to cache()).
.cache(
) -> DataFrame
# Persists this DataFrame with the default storage level (MEMORY_AND_DISK).
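
# Example (illustrative; count() materializes the cached data):
df.cache().count()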
explode(
    col: Union[str, Column]
) -> Column
# Creates a new row for each element in the given array or map column (from pyspark.sql.functions).
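
# Example (illustrative; assumes df has an array column 'values'):
from pyspark.sql.functions import explode
df.select(explode(df['values']).alias('value')).collect()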
split(
    str: Union[Column, str],
    pattern: str,
    limit: int = -1
) -> Column
# Splits the string column around matches of pattern (pattern is a regular expression).

# Example:
from pyspark.sql.functions import split
df.select(split(df['name'], ' ').alias('s')).collect()
.coalesce(
    numPartitions: int
) -> DataFrame
# Returns a new DataFrame that has exactly numPartitions partitions.
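
# Example (illustrative; collapses df into a single partition):
df.coalesce(1)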
.select(
    *cols: str
) -> DataFrame
# Projects a set of expressions and returns a new DataFrame.
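
# Example (illustrative; assumes columns 'name' and 'age'):
df.select('name', 'age').collect()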
.alias(
    alias: str
) -> DataFrame
# Returns a new DataFrame with an alias set.

# Example:
df.select(df.age.alias("age2")).collect()
spark.read.parquet('path/to/file.parquet')
# Loads Parquet files, returning the result as a DataFrame.
.col(
    colName: str
) -> Column
# Returns a Column based on the given column name.
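
# Example (illustrative; assumes a column 'age'):
from pyspark.sql.functions import col
df.filter(col('age') > 3).collect()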
.take(
    num: int
) -> List[Row]
# Returns the first num rows as a list of Row.
.filter(
    condition: Union[str, Column]
) -> DataFrame
# Filters rows using the given condition.

# Example:
df.filter(df.age > 3).collect()
.collect(
) -> List[Row]
# Returns all the records as a list of Row.
.withColumn(
    colName: str,
    col: Union[Column, str]
) -> DataFrame
# Returns a new DataFrame by adding a column or replacing the existing column that has the same name.
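
# Example (illustrative; assumes a numeric column 'age'):
df.withColumn('age2', df.age + 2).collect()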
.where(
    condition: Union[str, Column]
) -> DataFrame
# Filters rows using the given condition.
.write.parquet('path/to/file.parquet')

# Saves the content of the DataFrame in Parquet format at the specified path.
.groupBy(
    *cols: str
) -> GroupedData
# Groups the DataFrame using the specified columns, so we can run aggregation on them.
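
# Example (illustrative; assumes a column 'name'):
df.groupBy('name').count().collect()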
.sort(
    *cols: str,
    **kwargs
) -> DataFrame

# Returns a new DataFrame sorted by the specified column(s).

# Example:
df.sort(df.age.desc()).collect()
agg(
    *exprs: Union[Column, str]
) -> DataFrame
# Aggregates on the entire DataFrame without groups (shorthand for df.groupBy().agg()).
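
# Example (illustrative; assumes a numeric column 'age'):
from pyspark.sql import functions as F
df.agg(F.max('age')).collect()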
avg(
    *cols: str
) -> DataFrame
# Computes the mean value for each numeric column for each group.
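
# Example (illustrative; assumes columns 'name' and 'age'):
df.groupBy('name').avg('age').collect()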
.withColumnRenamed(
    existing: str,
    new: str
) -> DataFrame
# Returns a new DataFrame by renaming an existing column.
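
# Example (illustrative; assumes a column 'age'):
df.withColumnRenamed('age', 'age2')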
.orderBy(
    *cols: Union[Column, str],
    **kwargs
) -> DataFrame
# Returns a new DataFrame sorted by the specified column(s).
.broadcast(
    df: DataFrame
) -> DataFrame
# Marks a DataFrame as small enough for use in broadcast joins.
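
# Example (illustrative; small_df is a hypothetical small lookup DataFrame):
from pyspark.sql.functions import broadcast
df.join(broadcast(small_df), on='id')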
.partitionBy(
    *cols: str
) -> DataFrameWriter
# Partitions the output by the given columns on the file system (a DataFrameWriter method, used via df.write).
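
# Example (illustrative; 'year' and the output path are placeholders):
df.write.partitionBy('year').parquet('path/to/output')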
.drop(
    *cols: str
) -> DataFrame
# Returns a new DataFrame with the specified column(s) dropped.
# Note that drop expects column names to be passed as strings when multiple columns are removed.
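
# Example (illustrative; removes the 'age' and 'name' columns):
df.drop('age', 'name')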
.count(
) -> int
# Returns the number of rows in this DataFrame.
.distinct(
) -> DataFrame
# Returns a new DataFrame containing the distinct rows in this DataFrame.
dropDuplicates(
    subset: Union[str, List[str]] = None
) -> DataFrame
# Return a new DataFrame with duplicate rows removed, optionally only considering certain columns.
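
# Example (illustrative; considers only 'name' and 'age' when detecting duplicates):
df.dropDuplicates(['name', 'age'])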
lit(
    col: Union[str, int, float, bool]
) -> Column
# Creates a Column of literal value.
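
# Example (illustrative; adds a constant column 'height'):
from pyspark.sql.functions import lit
df.withColumn('height', lit(5))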