Mainly taken from the Polars documentation (pola.rs)
# join with common column name
df = df.join(
input_b.select(
"foo",  # the join key must be part of the selection
"bar",
"buzz",
),
on="foo",
how="left"
)
# join keys with different names: rename one side so a plain on= clause can be used
df = df.join(
input_b.select(
"foo",
"bar",
"buzz",
).rename({"bar": "new_name"}), # rename to allow a simple on= clause; see the alternative sketch below
on=["foo", "new_name"],
how="left"
)
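# alternative sketch: skip the rename and pass left_on / right_on instead
# (same columns as above; only the join parameters differ)
df = df.join(
    input_b.select("foo", "bar", "buzz"),
    left_on=["foo", "new_name"],
    right_on=["foo", "bar"],
    how="left"
)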
# simple filter
df = df.filter(pl.col('id') == '0366862')
df = df.filter(pl.col('foo').str.starts_with('A'))
df = df.filter((pl.col("bar") == 1) | (pl.col("buzz") == "c"))
# typical combined filter, including null checks
df = input_a.filter(
(pl.col('buzz') == pl.lit('hamburg'))
& (pl.col("foo") > 13000)
& (pl.col("a_date").is_not_null())
& (pl.col("a_second_date").is_null())
)
# filter by membership in a list of values
df.filter(pl.col("buzz").is_in(['OPEN', 'IN_WORK']))
# negated: two equivalent ways to exclude the listed values
df.filter(~pl.col("buzz").is_in(['OPEN', 'IN_WORK']))
df.filter(pl.col("buzz").is_in(['OPEN', 'IN_WORK']).not_())
from datetime import datetime, timedelta
# in the future
print(df.filter(pl.col('start') > datetime.now()))
# between now and two weeks from now
df.filter(
(pl.col('start') > datetime.now())
& (pl.col('start') < datetime.now() + timedelta(weeks=2))
)
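# equivalent sketch using is_between; closed="none" keeps both endpoints exclusive,
# matching the strict > and < comparisons above
df.filter(
    pl.col('start').is_between(
        datetime.now(),
        datetime.now() + timedelta(weeks=2),
        closed="none"
    )
)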
# select / drop a subset of columns
df = df.select("foo", "bar", "buzz")
df = df.drop("buzz")
# rename
df = df.rename({"old_name": "new_name"})
df2 = df.rename({
'Courses': 'Course_List',
'Fees': 'Course_Fee',
'Duration': 'Course_Duration',
'Discount': 'Course_Discount'
})
# basic
df.unique()
df.unique(maintain_order=True)
# determine uniqueness by a subset of columns
df.unique(subset=["bar", "buzz"], maintain_order=True)
# enforce a unique primary key "bar", keeping the newest row per key
df.sort(["bar", "date_of_creation"], descending=[False, True])\
.unique(subset=["bar"], keep="first", maintain_order=True)
# parallel=True is only relevant for LazyFrames; see the lazy sketch below
new_df = pl.concat([input_a, input_b], parallel=True)
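# sketch of the lazy variant, where parallel=True actually has an effect
new_df = pl.concat([input_a.lazy(), input_b.lazy()], parallel=True).collect()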
# replace column with calculated value
df = df.with_columns(pl.col("a").cast(pl.Float64))
# create a new column
df = df.with_columns((pl.col("a") ** 2).alias("a_square"))
# alternative way to write
df = df.with_columns(
a_square = (pl.col("a") ** 2)
)
# if-then-else, replacing the column in place
#
# A classic cleaning step: set all empty strings
# and the literal string "NULL" to a real null / None
df = df.with_columns(
pl.when((pl.col("foo") == '') | (pl.col("foo") == 'NULL'))
.then(pl.lit(None))
.otherwise(pl.col("foo"))
.alias("foo")
)
# multiple conditions (comparable to if ... elif ... else)
df = df.with_columns(
pl.when(pl.col("foo") > 2)
.then(1)
.when(pl.col("bar") > 2)
.then(4)
.otherwise(-1)
.alias("val")
)
# Concatenation of string columns; see the separator variant below
df = df.with_columns(concat = pl.concat_str(pl.col("foo"), pl.lit('###'), pl.col("bar")))
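# same result sketched with the separator parameter instead of an explicit literal
df = df.with_columns(concat = pl.concat_str([pl.col("foo"), pl.col("bar")], separator='###'))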
# Parsing strings into datetimes (here on the "foo" column)
df = df.with_columns(pl.col('foo').str.to_datetime("%Y-%m-%d %H:%M%#z"))
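# the reverse direction, as a sketch: format a datetime column back into strings
# (assumes "a_date" is a Date/Datetime column)
df = df.with_columns(pl.col("a_date").dt.strftime("%Y-%m-%d").alias("a_date_str"))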
# basic
df = df.with_columns(pl.col('foo').str.replace(pl.lit('old_string'), pl.lit('new_string')))
# regex
df = df.with_columns(pl.col('foo').str.replace(r'HAM\d', pl.lit('Hamburg')))
# Default behaviour is to raise an error if the cast fails.
# strict=False gives the behaviour that is the default in Contour and PySpark:
# values that cannot be cast become null.
df = df.with_columns(
    pl.col("foo")
    .cast(pl.Int32, strict=False)
    .cast(pl.String)
    .str.zfill(12)  # equivalent to a left-pad with '0'
    .alias("bazz_cast")
)
# creation of empty columns
df = df.with_columns(
arr_1=pl.lit([]),
arr_2=pl.lit(None).cast(pl.Array(width=1, inner=pl.String)),
str_1=pl.lit(None).cast(pl.String),
bool_1=pl.lit(None).cast(pl.Boolean),
int_1=pl.lit(None).cast(pl.Int32)
)
# coalesce
df = df.with_columns(pl.coalesce(["foo", "bar"]).alias("foo"))
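# for the two-column case, a roughly equivalent sketch with fill_null
df = df.with_columns(pl.col("foo").fill_null(pl.col("bar")).alias("foo"))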
# split by a character
df = df.with_columns(pl.col("foo").str.split(',').alias("split"))
# then join with a delimiter
df = df.with_columns(joined_col = pl.col("split").list.join(";"))
# explode an array-column
df = df.explode("split")
# typical move: trim whitespace at the beginning and end
df = df.with_columns(pl.col("split").str.strip_chars())
# aggregate the values of each group into a list column
df = df.group_by("foo", "bar").agg(
pl.col("buzz")
)
# aggregate and join the group's values with a comma in one step
df = df.group_by(["foo", "bar"]).agg(
pl.col("buzz").str.join(",")
)
# for an array_agg_distinct of "buzz", enforce uniqueness first; see the alternative sketch below
df = (
df
.unique(subset=["foo", "bar", "buzz"])
.group_by("foo", "bar")
.agg(pl.col("buzz").str.join(",").alias("buzz"))
)
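# alternative sketch: take the distinct values inside the aggregation
# (note: unique() does not guarantee the order of the values)
df = df.group_by("foo", "bar").agg(
    pl.col("buzz").unique().str.join(",").alias("buzz")
)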