import ibis
= ibis.connect("duckdb://nycflights13.ddb")
con
con.create_table("flights", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True
)
con.create_table("weather", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True
)
Preprocess your data with recipes
Prepare data for modeling with modular preprocessing steps.
Introduction
In this article, we’ll explore Recipe
s, which are designed to help you preprocess your data before training your model. Recipes are built as a series of preprocessing steps, such as:
converting qualitative predictors to indicator variables (also known as dummy variables),
transforming data to be on a different scale (e.g., taking the logarithm of a variable),
transforming whole groups of predictors together,
extracting key features from raw variables (e.g., getting the day of the week out of a date variable),
and so on. If you are familiar with scikit-learn’s dataset transformations, a lot of this might sound familiar and like what a transformer already does. Recipes can be used to do many of the same things, but they can scale your workloads on any Ibis-supported backend. This article shows how to use recipes for modeling.
To use code in this article, you will need to install the following packages: Ibis, IbisML, and skorch, a high-level library for PyTorch that provides full scikit-learn compatibility.
pip install 'ibis-framework[duckdb,examples]' ibis-ml skorch torch
The New York City flight data
Let’s use the nycflights13 data to predict whether a plane arrives more than 30 minutes late. This dataset contains information on 325,819 flights departing near New York City in 2013. Let’s start by loading the data and making a few changes to the variables:
You can now see the example dataset copied over to the database:
= ibis.connect("duckdb://nycflights13.ddb")
con con.list_tables()
['flights', 'weather']
We’ll turn on interactive mode, which partially executes queries to give users a preview of the results.
= True ibis.options.interactive
= con.table("flights")
flights = flights.mutate(
flights =(
dep_time4, "0").substr(0, 2)
flights.dep_time.lpad(+ ":"
+ flights.dep_time.substr(-2, 2)
+ ":00"
"time"),
).try_cast(=flights.arr_delay.try_cast(int),
arr_delay=flights.air_time.try_cast(int),
air_time
) flights
┏━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ year ┃ month ┃ day ┃ dep_time ┃ sched_dep_time ┃ dep_delay ┃ arr_time ┃ sched_arr_time ┃ arr_delay ┃ carrier ┃ flight ┃ tailnum ┃ origin ┃ dest ┃ air_time ┃ distance ┃ hour ┃ minute ┃ time_hour ┃ ┡━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ int64 │ time │ int64 │ string │ string │ int64 │ int64 │ string │ int64 │ string │ string │ string │ int64 │ int64 │ int64 │ int64 │ timestamp(6) │ ├───────┼───────┼───────┼──────────┼────────────────┼───────────┼──────────┼────────────────┼───────────┼─────────┼────────┼─────────┼────────┼────────┼──────────┼──────────┼───────┼────────┼─────────────────────┤ │ 2013 │ 1 │ 1 │ 05:17:00 │ 515 │ 2 │ 830 │ 819 │ 11 │ UA │ 1545 │ N14228 │ EWR │ IAH │ 227 │ 1400 │ 5 │ 15 │ 2013-01-01 10:00:00 │ │ 2013 │ 1 │ 1 │ 05:33:00 │ 529 │ 4 │ 850 │ 830 │ 20 │ UA │ 1714 │ N24211 │ LGA │ IAH │ 227 │ 1416 │ 5 │ 29 │ 2013-01-01 10:00:00 │ │ 2013 │ 1 │ 1 │ 05:42:00 │ 540 │ 2 │ 923 │ 850 │ 33 │ AA │ 1141 │ N619AA │ JFK │ MIA │ 160 │ 1089 │ 5 │ 40 │ 2013-01-01 10:00:00 │ │ 2013 │ 1 │ 1 │ 05:44:00 │ 545 │ -1 │ 1004 │ 1022 │ -18 │ B6 │ 725 │ N804JB │ JFK │ BQN │ 183 │ 1576 │ 5 │ 45 │ 2013-01-01 10:00:00 │ │ 2013 │ 1 │ 1 │ 05:54:00 │ 600 │ -6 │ 812 │ 837 │ -25 │ DL │ 461 │ N668DN │ LGA │ ATL │ 116 │ 762 │ 6 │ 0 │ 2013-01-01 11:00:00 │ │ 2013 │ 1 │ 1 │ 05:54:00 │ 558 │ -4 │ 740 │ 728 │ 12 │ UA │ 1696 │ N39463 │ EWR │ ORD │ 150 │ 719 │ 5 │ 58 │ 2013-01-01 10:00:00 │ │ 2013 │ 1 │ 1 │ 05:55:00 │ 600 │ -5 │ 913 │ 854 │ 19 │ B6 │ 507 │ N516JB │ EWR │ FLL │ 158 │ 1065 │ 6 │ 0 │ 2013-01-01 11:00:00 │ │ 2013 │ 1 │ 1 │ 05:57:00 │ 600 │ -3 │ 709 │ 723 │ -14 │ EV │ 5708 │ N829AS │ LGA │ IAD │ 53 │ 229 │ 6 │ 0 │ 2013-01-01 11:00:00 │ │ 2013 │ 1 │ 1 │ 05:57:00 │ 600 │ -3 │ 838 │ 846 │ -8 │ B6 │ 79 │ N593JB │ JFK │ MCO │ 140 │ 944 │ 6 │ 0 │ 2013-01-01 11:00:00 │ │ 2013 │ 1 │ 1 │ 05:58:00 │ 600 │ -2 │ 753 │ 745 │ 8 │ AA │ 301 │ N3ALAA │ LGA │ ORD │ 138 │ 733 │ 6 │ 0 │ 2013-01-01 11:00:00 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └───────┴───────┴───────┴──────────┴────────────────┴───────────┴──────────┴────────────────┴───────────┴─────────┴────────┴─────────┴────────┴────────┴──────────┴──────────┴───────┴────────┴─────────────────────┘
= con.table("weather")
weather weather
┏━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ origin ┃ year ┃ month ┃ day ┃ hour ┃ temp ┃ dewp ┃ humid ┃ wind_dir ┃ wind_speed ┃ wind_gust ┃ precip ┃ pressure ┃ visib ┃ time_hour ┃ ┡━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ string │ int64 │ int64 │ int64 │ int64 │ string │ string │ string │ string │ string │ string │ float64 │ string │ float64 │ timestamp(6) │ ├────────┼───────┼───────┼───────┼───────┼────────┼────────┼────────┼──────────┼────────────────────┼───────────┼─────────┼──────────┼─────────┼─────────────────────┤ │ EWR │ 2013 │ 1 │ 1 │ 1 │ 39.02 │ 26.06 │ 59.37 │ 270 │ 10.357019999999999 │ NA │ 0.0 │ 1012 │ 10.0 │ 2013-01-01 06:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 2 │ 39.02 │ 26.96 │ 61.63 │ 250 │ 8.05546 │ NA │ 0.0 │ 1012.3 │ 10.0 │ 2013-01-01 07:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 3 │ 39.02 │ 28.04 │ 64.43 │ 240 │ 11.5078 │ NA │ 0.0 │ 1012.5 │ 10.0 │ 2013-01-01 08:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 4 │ 39.92 │ 28.04 │ 62.21 │ 250 │ 12.658579999999999 │ NA │ 0.0 │ 1012.2 │ 10.0 │ 2013-01-01 09:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 5 │ 39.02 │ 28.04 │ 64.43 │ 260 │ 12.658579999999999 │ NA │ 0.0 │ 1011.9 │ 10.0 │ 2013-01-01 10:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 6 │ 37.94 │ 28.04 │ 67.21 │ 240 │ 11.5078 │ NA │ 0.0 │ 1012.4 │ 10.0 │ 2013-01-01 11:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 7 │ 39.02 │ 28.04 │ 64.43 │ 240 │ 14.960139999999999 │ NA │ 0.0 │ 1012.2 │ 10.0 │ 2013-01-01 12:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 8 │ 39.92 │ 28.04 │ 62.21 │ 250 │ 10.357019999999999 │ NA │ 0.0 │ 1012.2 │ 10.0 │ 2013-01-01 13:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 9 │ 39.92 │ 28.04 │ 62.21 │ 260 │ 14.960139999999999 │ NA │ 0.0 │ 1012.7 │ 10.0 │ 2013-01-01 14:00:00 │ │ EWR │ 2013 │ 1 │ 1 │ 10 │ 41 │ 28.04 │ 59.65 │ 260 │ 13.809359999999998 │ NA │ 0.0 │ 1012.4 │ 10.0 │ 2013-01-01 15:00:00 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └────────┴───────┴───────┴───────┴───────┴────────┴────────┴────────┴──────────┴────────────────────┴───────────┴─────────┴──────────┴─────────┴─────────────────────┘
= (
flight_data
flights.mutate(# Convert the arrival delay to a factor
# By default, PyTorch expects the target to have a Long datatype
=ibis.ifelse(flights.arr_delay >= 30, 1, 0).cast("int64"),
arr_delay# We will use the date (not date-time) in the recipe below
=flights.time_hour.date(),
date
)# Include the weather data
"origin", "time_hour"])
.inner_join(weather, [# Only retain the specific columns we will use
.select("dep_time",
"flight",
"origin",
"dest",
"air_time",
"distance",
"carrier",
"date",
"arr_delay",
"time_hour",
)# Exclude missing data
.drop_null()
) flight_data
┏━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ dep_time ┃ flight ┃ origin ┃ dest ┃ air_time ┃ distance ┃ carrier ┃ date ┃ arr_delay ┃ time_hour ┃ ┡━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ time │ int64 │ string │ string │ int64 │ int64 │ string │ date │ int64 │ timestamp(6) │ ├──────────┼────────┼────────┼────────┼──────────┼──────────┼─────────┼────────────┼───────────┼─────────────────────┤ │ 05:17:00 │ 1545 │ EWR │ IAH │ 227 │ 1400 │ UA │ 2013-01-01 │ 0 │ 2013-01-01 10:00:00 │ │ 05:33:00 │ 1714 │ LGA │ IAH │ 227 │ 1416 │ UA │ 2013-01-01 │ 0 │ 2013-01-01 10:00:00 │ │ 05:42:00 │ 1141 │ JFK │ MIA │ 160 │ 1089 │ AA │ 2013-01-01 │ 1 │ 2013-01-01 10:00:00 │ │ 05:44:00 │ 725 │ JFK │ BQN │ 183 │ 1576 │ B6 │ 2013-01-01 │ 0 │ 2013-01-01 10:00:00 │ │ 05:54:00 │ 461 │ LGA │ ATL │ 116 │ 762 │ DL │ 2013-01-01 │ 0 │ 2013-01-01 11:00:00 │ │ 05:54:00 │ 1696 │ EWR │ ORD │ 150 │ 719 │ UA │ 2013-01-01 │ 0 │ 2013-01-01 10:00:00 │ │ 05:55:00 │ 507 │ EWR │ FLL │ 158 │ 1065 │ B6 │ 2013-01-01 │ 0 │ 2013-01-01 11:00:00 │ │ 05:57:00 │ 5708 │ LGA │ IAD │ 53 │ 229 │ EV │ 2013-01-01 │ 0 │ 2013-01-01 11:00:00 │ │ 05:57:00 │ 79 │ JFK │ MCO │ 140 │ 944 │ B6 │ 2013-01-01 │ 0 │ 2013-01-01 11:00:00 │ │ 05:58:00 │ 301 │ LGA │ ORD │ 138 │ 733 │ AA │ 2013-01-01 │ 0 │ 2013-01-01 11:00:00 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────┴────────┴────────┴────────┴──────────┴──────────┴─────────┴────────────┴───────────┴─────────────────────┘
We can see that about 16% of the flights in this dataset arrived more than 30 minutes late.
="arr_delay_count").mutate(
flight_data.arr_delay.value_counts().rename(n=ibis._.n / ibis._.n.sum()
prop )
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓ ┃ arr_delay ┃ n ┃ prop ┃ ┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩ │ int64 │ int64 │ float64 │ ├───────────┼────────┼──────────┤ │ 0 │ 273279 │ 0.838745 │ │ 1 │ 52540 │ 0.161255 │ └───────────┴────────┴──────────┘
Data splitting
To get started, let’s split this single dataset into two: a training set and a testing set. We’ll keep most of the rows in the original dataset (subset chosen randomly) in the training set. The training data will be used to fit the model, and the testing set will be used to measure model performance.
Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.
import ibis_ml as ml
# Create data frames for the two sets:
= ml.train_test_split(
train_data, test_data
flight_data,=["carrier", "flight", "date"],
unique_key# Put 3/4 of the data into the training set
=0.25,
test_size=4,
num_buckets# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible when random numbers are used
=222,
random_seed )
Create features
= ml.Recipe(
flights_rec "date", components=["dow", "month"]),
ml.ExpandDate("date"),
ml.Drop(
ml.TargetEncode(ml.nominal()),
ml.DropZeroVariance(ml.everything()),"dep_time", ibis._.hour() * 60 + ibis._.minute()),
ml.MutateAt(
ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),# By default, PyTorch requires that the type of `X` is `np.float32`.
# https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2
"float32"),
ml.Cast(ml.numeric(), )
Fit a model with a recipe
Let’s model the flight data. We can use any scikit-learn-compatible estimator.
We will want to use our recipe across several steps as we train and test our model. We will:
Process the recipe using the training set: This involves any estimation or calculations based on the training set. For our recipe, the training set will be used to determine which predictors should be converted to dummy variables and which predictors will have zero-variance in the training set, and should be slated for removal.
Apply the recipe to the training set: We create the final predictor set on the training set.
Apply the recipe to the test set: We create the final predictor set on the test set. Nothing is recomputed and no information from the test set is used here; the dummy variable and zero-variance results from the training set are applied to the test set.
To simplify this process, we can use a scikit-learn Pipeline
.
from sklearn.pipeline import Pipeline
from skorch import NeuralNetClassifier
from torch import nn
class MyModule(nn.Module):
def __init__(self, num_units=10, nonlin=nn.ReLU()):
super().__init__()
self.dense0 = nn.Linear(10, num_units)
self.nonlin = nonlin
self.dropout = nn.Dropout(0.5)
self.dense1 = nn.Linear(num_units, num_units)
self.output = nn.Linear(num_units, 2)
self.softmax = nn.Softmax(dim=-1)
def forward(self, X, **kwargs):
= self.nonlin(self.dense0(X))
X = self.dropout(X)
X = self.nonlin(self.dense1(X))
X = self.softmax(self.output(X))
X return X
= NeuralNetClassifier(
net
MyModule,=10,
max_epochs=0.1,
lr# Shuffle training data on each epoch
=True,
iterator_train__shuffle
)
= Pipeline([("flights_rec", flights_rec), ("net", net)]) pipe
Now, there is a single function that can be used to prepare the recipe and train the model from the resulting predictors:
= train_data.drop("arr_delay")
X_train = train_data.arr_delay
y_train pipe.fit(X_train, y_train)
epoch train_loss valid_acc valid_loss dur
------- ------------ ----------- ------------ ------
1 6.5582 0.8386 2.5725 2.3141
2 5.6226 0.8386 2.5725 2.2893
3 5.4017 0.8386 2.5725 2.2692
4 5.4062 0.8386 2.5725 2.2739
5 5.4181 0.8386 2.5725 2.2667
6 5.3982 0.8386 2.5725 2.2686
7 5.4326 0.8386 2.5725 2.2678
8 5.3825 0.8386 2.5725 2.2839
9 3.7977 0.8386 2.5725 2.2929
10 2.5037 0.8386 2.5725 2.2969
Pipeline(steps=[('flights_rec', Recipe(ExpandDate(cols(('date',)), components=['dow', 'month']), Drop(cols(('date',))), TargetEncode(nominal(), smooth=0.0), DropZeroVariance(everything(), tolerance=0.0001), MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())), MutateAt(timestamp(), _.epoch_seconds()), Cast(numeric(), 'float32'))), ('net', <class 'skorch.classifier.NeuralNetClassifier'>[initialized]( module_=MyModule( (dense0): Linear(in_features=10, out_features=10, bias=True) (nonlin): ReLU() (dropout): Dropout(p=0.5, inplace=False) (dense1): Linear(in_features=10, out_features=10, bias=True) (output): Linear(in_features=10, out_features=2, bias=True) (softmax): Softmax(dim=-1) ), ))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('flights_rec', Recipe(ExpandDate(cols(('date',)), components=['dow', 'month']), Drop(cols(('date',))), TargetEncode(nominal(), smooth=0.0), DropZeroVariance(everything(), tolerance=0.0001), MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())), MutateAt(timestamp(), _.epoch_seconds()), Cast(numeric(), 'float32'))), ('net', <class 'skorch.classifier.NeuralNetClassifier'>[initialized]( module_=MyModule( (dense0): Linear(in_features=10, out_features=10, bias=True) (nonlin): ReLU() (dropout): Dropout(p=0.5, inplace=False) (dense1): Linear(in_features=10, out_features=10, bias=True) (output): Linear(in_features=10, out_features=2, bias=True) (softmax): Softmax(dim=-1) ), ))])
Recipe(ExpandDate(cols(('date',)), components=['dow', 'month']), Drop(cols(('date',))), TargetEncode(nominal(), smooth=0.0), DropZeroVariance(everything(), tolerance=0.0001), MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute())), MutateAt(timestamp(), _.epoch_seconds()), Cast(numeric(), 'float32'))
ExpandDate(cols(('date',)), components=['dow', 'month'])
Drop(cols(('date',)))
TargetEncode(nominal(), smooth=0.0)
DropZeroVariance(everything(), tolerance=0.0001)
MutateAt(cols(('dep_time',)), ((_.hour() * 60) + _.minute()))
MutateAt(timestamp(), _.epoch_seconds())
Cast(numeric(), 'float32')
<class 'skorch.classifier.NeuralNetClassifier'>[initialized]( module_=MyModule( (dense0): Linear(in_features=10, out_features=10, bias=True) (nonlin): ReLU() (dropout): Dropout(p=0.5, inplace=False) (dense1): Linear(in_features=10, out_features=10, bias=True) (output): Linear(in_features=10, out_features=2, bias=True) (softmax): Softmax(dim=-1) ), )
Use a trained workflow to predict
…
= test_data.drop("arr_delay")
X_test = test_data.arr_delay
y_test pipe.score(X_test, y_test)
0.8390849833968762
Acknowledgments
This tutorial is derived from the tidymodels article of the same name. The transformation logic is very similar, and much of the text is copied verbatim.