Mainly taken from the Polars documentation (pola.rs)
# join with common column name
df = df.join(
input_b.select(
"foo",  # the join key must be part of the selection
"bar",
"buzz",
),
on="foo",
how="left"
)
# join keys with different names: rename one side so a plain on= clause can be used
df = df.join(
input_b.select(
"foo",
"bar",
"buzz",
).rename({"bar": "new_name"}), # rename to allow a simple on= clause; see the alternative sketch below
on=["foo", "new_name"],
how="left"
)
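# alternative sketch: skip the rename and pass left_on / right_on instead
# (same columns as above; only the join parameters differ)
df = df.join(
    input_b.select("foo", "bar", "buzz"),
    left_on=["foo", "new_name"],
    right_on=["foo", "bar"],
    how="left"
)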
# simple filter
df = df.filter(pl.col('id') == '0366862')
df = df.filter(pl.col('foo').str.starts_with('A'))
df = df.filter((pl.col("bar") == 1) | (pl.col("buzz") == "c"))
# typical combined filter, including null checks
df = input_a.filter(
(pl.col('buzz') == pl.lit('hamburg'))
& (pl.col("foo") > 13000)
& (pl.col("a_date").is_not_null())
& (pl.col("a_second_date").is_null())
)
# filter by membership in a list of values
df.filter(pl.col("buzz").is_in(['OPEN', 'IN_WORK']))
# negated: two equivalent ways to exclude the listed values
df.filter(~pl.col("buzz").is_in(['OPEN', 'IN_WORK']))
df.filter(pl.col("buzz").is_in(['OPEN', 'IN_WORK']).not_())
from datetime import datetime, timedelta
# in the future
print(df.filter(pl.col('start') > datetime.now()))
# between now and two weeks from now
df.filter(
(pl.col('start') > datetime.now())
& (pl.col('start') < datetime.now() + timedelta(weeks=2))
)
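# equivalent sketch using is_between; closed="none" keeps both endpoints exclusive,
# matching the strict > and < comparisons above
df.filter(
    pl.col('start').is_between(
        datetime.now(),
        datetime.now() + timedelta(weeks=2),
        closed="none"
    )
)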
# select / drop a subset of columns
df = df.select("foo", "bar", "buzz")
df = df.drop("buzz")
# rename
df = df.rename({"old_name": "new_name"})
df2 = df.rename({
'Courses': 'Course_List',
'Fees': 'Course_Fee',
'Duration': 'Course_Duration',
'Discount': 'Course_Discount'
})
# basic
df.unique()
df.unique(maintain_order=True)
# determine uniqueness by a subset of columns
df.unique(subset=["bar", "buzz"], maintain_order=True)
# enforce a unique primary key "bar", keeping the newest row per key
df.sort(["bar", "date_of_creation"], descending=[False, True])\
.unique(subset=["bar"], keep="first", maintain_order=True)
# parallel=True is only relevant for LazyFrames; see the lazy sketch below
new_df = pl.concat([input_a, input_b], parallel=True)
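# sketch of the lazy variant, where parallel=True actually has an effect
new_df = pl.concat([input_a.lazy(), input_b.lazy()], parallel=True).collect()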
# replace column with calculated value
df = df.with_columns(pl.col("a").cast(pl.Float64))
# create a new column
df = df.with_columns((pl.col("a") ** 2).alias("a_square"))
# alternative way to write
df = df.with_columns(
a_square = (pl.col("a") ** 2)
)
# if-then-else, replacing the column in place
#
# A classic cleaning step: set all empty strings
# and the literal string "NULL" to a real null / None
df = df.with_columns(
pl.when((pl.col("foo") == '') | (pl.col("foo") == 'NULL'))
.then(pl.lit(None))
.otherwise(pl.col("foo"))
.alias("foo")
)
# multiple conditions (comparable to if ... elif ... else)
df = df.with_columns(
pl.when(pl.col("foo") > 2)
.then(1)
.when(pl.col("bar") > 2)
.then(4)
.otherwise(-1)
.alias("val")
)
# Concatenation of string columns; see the separator variant below
df = df.with_columns(concat = pl.concat_str(pl.col("foo"), pl.lit('###'), pl.col("bar")))
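# same result sketched with the separator parameter instead of an explicit literal
df = df.with_columns(concat = pl.concat_str([pl.col("foo"), pl.col("bar")], separator='###'))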
# Parsing strings into datetimes (here on the "foo" column)
df = df.with_columns(pl.col('foo').str.to_datetime("%Y-%m-%d %H:%M%#z"))
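# the reverse direction, as a sketch: format a datetime column back into strings
# (assumes "a_date" is a Date/Datetime column)
df = df.with_columns(pl.col("a_date").dt.strftime("%Y-%m-%d").alias("a_date_str"))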
# basic
df = df.with_columns(pl.col('foo').str.replace(pl.lit('old_string'), pl.lit('new_string')))
# regex
df = df.with_columns(pl.col('foo').str.replace(r'HAM\d', pl.lit('Hamburg')))
# Default behaviour is to raise an error if the cast fails.
# strict=False gives the behaviour that is the default in Contour and PySpark:
# values that cannot be cast become null.
df = df.with_columns(
    pl.col("foo")
    .cast(pl.Int32, strict=False)
    .cast(pl.String)
    .str.zfill(12)  # equivalent to a left-pad with '0'
    .alias("bazz_cast")
)
# creation of empty columns
df = df.with_columns(
arr_1=pl.lit([]),
arr_2=pl.lit(None).cast(pl.Array(width=1, inner=pl.String)),
str_1=pl.lit(None).cast(pl.String),
bool_1=pl.lit(None).cast(pl.Boolean),
int_1=pl.lit(None).cast(pl.Int32)
)
# coalesce
df = df.with_columns(pl.coalesce(["foo", "bar"]).alias("foo"))
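# for the two-column case, a roughly equivalent sketch with fill_null
df = df.with_columns(pl.col("foo").fill_null(pl.col("bar")).alias("foo"))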
# split by a character
df = df.with_columns(pl.col("foo").str.split(',').alias("split"))
# then join with a delimiter
df = df.with_columns(joined_col = pl.col("split").list.join(";"))
# explode an array-column
df = df.explode("split")
# typical move: trim whitespace at the beginning and end
df = df.with_columns(pl.col("split").str.strip_chars())
# aggregate the values of each group into a list column
df = df.group_by("foo", "bar").agg(
pl.col("buzz")
)
# aggregate and join the group's values with a comma in one step
df = df.group_by(["foo", "bar"]).agg(
pl.col("buzz").str.join(",")
)
# for an array_agg_distinct of "buzz", enforce uniqueness first; see the alternative sketch below
df = (
df
.unique(subset=["foo", "bar", "buzz"])
.group_by("foo", "bar")
.agg(pl.col("buzz").str.join(",").alias("buzz"))
)
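# alternative sketch: take the distinct values inside the aggregation
# (note: unique() does not guarantee the order of the values)
df = df.group_by("foo", "bar").agg(
    pl.col("buzz").unique().str.join(",").alias("buzz")
)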