Creating and Inserting Data¶
Setup¶
import os
import tempfile
from pathlib import Path
from urllib.request import urlretrieve
tempdir = Path(tempfile.gettempdir())
geography_db_file = tempdir / 'geography.db'
if geography_db_file.exists():
geography_db_file.unlink()
_ = urlretrieve(
'https://storage.googleapis.com/ibis-tutorial-data/geography.db',
geography_db_file,
)
import ibis
ibis.options.interactive = True
connection = ibis.sqlite.connect(geography_db_file)
Creating new tables from Ibis expressions¶
Suppose you have an Ibis expression that produces a table:
countries = connection.table('countries')
continent_name = (
countries.continent.case()
.when('AF', 'Africa')
.when('AN', 'Antarctica')
.when('AS', 'Asia')
.when('EU', 'Europe')
.when('NA', 'North America')
.when('OC', 'Oceania')
.when('SA', 'South America')
.else_(countries.continent)
.end()
.name('continent_name')
)
expr = countries[countries.continent, continent_name].distinct()
expr
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ continent ┃ continent_name ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ │ string │ string │ ├───────────┼────────────────┤ │ EU │ Europe │ │ AS │ Asia │ │ NA │ North America │ │ AF │ Africa │ │ AN │ Antarctica │ │ SA │ South America │ │ OC │ Oceania │ └───────────┴────────────────┘
To create a table in the database from the results of this expression, use the connection's create_table
method:
connection.create_table('continents', expr)
continents = connection.table('continents')
continents
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ continent ┃ continent_name ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ │ string │ string │ ├───────────┼────────────────┤ │ EU │ Europe │ │ AS │ Asia │ │ NA │ North America │ │ AF │ Africa │ │ AN │ Antarctica │ │ SA │ South America │ │ OC │ Oceania │ └───────────┴────────────────┘
Tables can be similarly dropped with drop_table
connection.drop_table('continents')
Creating new tables from in-memory Pandas dataframes¶
Pandas and NumPy are convenient to create test data in memory as a dataframe. This can then be turned into an Ibis expression using ibis.memtable
.
import pandas as pd
import numpy as np
def make_students_df(num_records, random_seed=None):
rng = np.random.default_rng(random_seed)
return pd.DataFrame(
{
"firstname": rng.choice(["Alice", "Bob", "Jane", "John"], size=num_records),
"birth_date": (
pd.to_datetime("2021-01-01")
+ pd.to_timedelta(rng.integers(0, 365, size=num_records), unit="D")
),
"math_grade": rng.normal(55, 10, size=num_records).clip(0, 100).round(1),
}
)
students_df = make_students_df(21, random_seed=42)
students_memtable = ibis.memtable(students_df)
students_memtable
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ ┃ firstname ┃ birth_date ┃ math_grade ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ │ string │ timestamp │ float64 │ ├───────────┼─────────────────────┼────────────┤ │ Alice │ 2021-05-16 00:00:00 │ 48.2 │ │ John │ 2021-03-08 00:00:00 │ 67.2 │ │ Jane │ 2021-12-05 00:00:00 │ 53.5 │ │ Bob │ 2021-10-13 00:00:00 │ 50.7 │ │ Bob │ 2021-08-24 00:00:00 │ 51.5 │ │ John │ 2021-05-27 00:00:00 │ 60.3 │ │ Alice │ 2021-10-28 00:00:00 │ 58.7 │ │ Jane │ 2021-07-19 00:00:00 │ 59.1 │ │ Alice │ 2021-06-11 00:00:00 │ 59.3 │ │ Alice │ 2021-06-14 00:00:00 │ 76.4 │ │ … │ … │ … │ └───────────┴─────────────────────┴────────────┘
By default ibis.memtable
uses the duckdb
in-memory backend to execute queries against the Pandas dataframe data efficiently.
We can then materialize it as a physical table for a specific backend if necessary:
students_db_file = tempdir / "ibis_tutorial_students.duckdb"
if students_db_file.exists():
students_db_file.unlink()
connection = ibis.duckdb.connect(students_db_file)
connection.create_table('students', students_memtable)
students = connection.table('students')
students.group_by(students.birth_date.month()).aggregate(
count=students.count(),
avg_math_grade=students.math_grade.mean(),
)
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ ExtractMonth(birth_date) ┃ count ┃ avg_math_grade ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩ │ int32 │ int64 │ float64 │ ├──────────────────────────┼───────┼────────────────┤ │ 1 │ 1 │ 66.300000 │ │ 2 │ 1 │ 49.900000 │ │ 3 │ 3 │ 60.166667 │ │ 4 │ 1 │ 46.800000 │ │ 5 │ 2 │ 54.250000 │ │ 6 │ 2 │ 67.850000 │ │ 7 │ 2 │ 53.000000 │ │ 8 │ 2 │ 56.500000 │ │ 10 │ 4 │ 54.100000 │ │ 11 │ 2 │ 57.550000 │ │ … │ … │ … │ └──────────────────────────┴───────┴────────────────┘
Note that NumPy, Pandas and ibis.memtable
are only suitable to generate data that fits in memory. To generate data larger than memory, we can generate data in chunks and iteratively insert the chunks using connection.insert(tablename, pandas_dataframe)
:
connection.insert(students.get_name(), make_students_df(10_000, random_seed=43))
students.count()
10021
connection.insert(students.get_name(), make_students_df(10_000, random_seed=44))
students.count()
20021
students.group_by(students.birth_date.month()).aggregate(
count=students.count(),
avg_math_grade=students.math_grade.mean(),
)
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ ExtractMonth(birth_date) ┃ count ┃ avg_math_grade ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩ │ int32 │ int64 │ float64 │ ├──────────────────────────┼───────┼────────────────┤ │ 1 │ 1751 │ 55.217133 │ │ 2 │ 1539 │ 55.189799 │ │ 3 │ 1717 │ 55.157833 │ │ 4 │ 1630 │ 54.603742 │ │ 5 │ 1728 │ 54.916319 │ │ 6 │ 1623 │ 54.700924 │ │ 7 │ 1710 │ 54.884503 │ │ 8 │ 1712 │ 55.278797 │ │ 9 │ 1622 │ 55.300062 │ │ 10 │ 1679 │ 55.220310 │ │ … │ … │ … │ └──────────────────────────┴───────┴────────────────┘
Inserting data into existing tables¶
Some backends support inserting data into existing tables from expressions. This can be done using connection.insert('table_name', expr)
.