Creating and Inserting Data¶

Setup¶

In [1]:

            
                Copied!
                
                    
                    
                
                

        
import os
import tempfile
from pathlib import Path
from urllib.request import urlretrieve

tempdir = Path(tempfile.gettempdir())
geography_db_file = tempdir / 'geography.db'

if geography_db_file.exists():
    geography_db_file.unlink()

_ = urlretrieve(
    'https://storage.googleapis.com/ibis-tutorial-data/geography.db',
    geography_db_file,
)
import os
import tempfile
from pathlib import Path
from urllib.request import urlretrieve

tempdir = Path(tempfile.gettempdir())
geography_db_file = tempdir / 'geography.db'

if geography_db_file.exists():
    geography_db_file.unlink()

_ = urlretrieve(
    'https://storage.googleapis.com/ibis-tutorial-data/geography.db',
    geography_db_file,
)

In [2]:

            
                Copied!
                
import ibis

ibis.options.interactive = True

connection = ibis.sqlite.connect(geography_db_file)
import ibis

ibis.options.interactive = True

connection = ibis.sqlite.connect(geography_db_file)

Creating new tables from Ibis expressions¶

Suppose you have an Ibis expression that produces a table:

In [3]:

            
                Copied!
                
                    
                    
                
                

        
countries = connection.table('countries')

continent_name = (
    countries.continent.case()
    .when('AF', 'Africa')
    .when('AN', 'Antarctica')
    .when('AS', 'Asia')
    .when('EU', 'Europe')
    .when('NA', 'North America')
    .when('OC', 'Oceania')
    .when('SA', 'South America')
    .else_(countries.continent)
    .end()
    .name('continent_name')
)

expr = countries[countries.continent, continent_name].distinct()
expr
countries = connection.table('countries')

continent_name = (
    countries.continent.case()
    .when('AF', 'Africa')
    .when('AN', 'Antarctica')
    .when('AS', 'Asia')
    .when('EU', 'Europe')
    .when('NA', 'North America')
    .when('OC', 'Oceania')
    .when('SA', 'South America')
    .else_(countries.continent)
    .end()
    .name('continent_name')
)

expr = countries[countries.continent, continent_name].distinct()
expr

Out[3]:

┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ continent ┃ continent_name ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ string    │ string         │
├───────────┼────────────────┤
│ EU        │ Europe         │
│ AS        │ Asia           │
│ NA        │ North America  │
│ AF        │ Africa         │
│ AN        │ Antarctica     │
│ SA        │ South America  │
│ OC        │ Oceania        │
└───────────┴────────────────┘

To create a table in the database from the results of this expression, use the connection's create_table method:

In [4]:

            
                Copied!
                
connection.create_table('continents', expr)
connection.create_table('continents', expr)

In [5]:

            
                Copied!
                
continents = connection.table('continents')
continents
continents = connection.table('continents')
continents

Out[5]:

┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ continent ┃ continent_name ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ string    │ string         │
├───────────┼────────────────┤
│ EU        │ Europe         │
│ AS        │ Asia           │
│ NA        │ North America  │
│ AF        │ Africa         │
│ AN        │ Antarctica     │
│ SA        │ South America  │
│ OC        │ Oceania        │
└───────────┴────────────────┘

Tables can be similarly dropped with drop_table

In [6]:

            
                Copied!
                
connection.drop_table('continents')
connection.drop_table('continents')

Creating new tables from in-memory Pandas dataframes¶

Pandas and NumPy are convenient to create test data in memory as a dataframe. This can then be turned into an Ibis expression using ibis.memtable.

In [7]:

            
                Copied!
                
                    
                    
                
                

        
import pandas as pd
import numpy as np


def make_students_df(num_records, random_seed=None):
    rng = np.random.default_rng(random_seed)
    return pd.DataFrame(
        {
            "firstname": rng.choice(["Alice", "Bob", "Jane", "John"], size=num_records),
            "birth_date": (
                pd.to_datetime("2021-01-01")
                + pd.to_timedelta(rng.integers(0, 365, size=num_records), unit="D")
            ),
            "math_grade": rng.normal(55, 10, size=num_records).clip(0, 100).round(1),
        }
    )

students_df = make_students_df(21, random_seed=42)
students_memtable = ibis.memtable(students_df)
students_memtable
import pandas as pd
import numpy as np


def make_students_df(num_records, random_seed=None):
    rng = np.random.default_rng(random_seed)
    return pd.DataFrame(
        {
            "firstname": rng.choice(["Alice", "Bob", "Jane", "John"], size=num_records),
            "birth_date": (
                pd.to_datetime("2021-01-01")
                + pd.to_timedelta(rng.integers(0, 365, size=num_records), unit="D")
            ),
            "math_grade": rng.normal(55, 10, size=num_records).clip(0, 100).round(1),
        }
    )

students_df = make_students_df(21, random_seed=42)
students_memtable = ibis.memtable(students_df)
students_memtable

Out[7]:

┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ firstname ┃ birth_date          ┃ math_grade ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ string    │ timestamp           │ float64    │
├───────────┼─────────────────────┼────────────┤
│ Alice     │ 2021-05-16 00:00:00 │       48.2 │
│ John      │ 2021-03-08 00:00:00 │       67.2 │
│ Jane      │ 2021-12-05 00:00:00 │       53.5 │
│ Bob       │ 2021-10-13 00:00:00 │       50.7 │
│ Bob       │ 2021-08-24 00:00:00 │       51.5 │
│ John      │ 2021-05-27 00:00:00 │       60.3 │
│ Alice     │ 2021-10-28 00:00:00 │       58.7 │
│ Jane      │ 2021-07-19 00:00:00 │       59.1 │
│ Alice     │ 2021-06-11 00:00:00 │       59.3 │
│ Alice     │ 2021-06-14 00:00:00 │       76.4 │
│ …         │ …                   │          … │
└───────────┴─────────────────────┴────────────┘

By default ibis.memtable uses the duckdb in-memory backend to execute queries against the Pandas dataframe data efficiently.

We can then materialize it as a physical table for a specific backend if necessary:

In [8]:

            
                Copied!
                
                    
                    
                
                

        
students_db_file = tempdir / "ibis_tutorial_students.duckdb"
if students_db_file.exists():
    students_db_file.unlink()

connection = ibis.duckdb.connect(students_db_file)
connection.create_table('students', students_memtable)
students = connection.table('students')
students.group_by(students.birth_date.month()).aggregate(
    count=students.count(),
    avg_math_grade=students.math_grade.mean(),
)
students_db_file = tempdir / "ibis_tutorial_students.duckdb"
if students_db_file.exists():
    students_db_file.unlink()

connection = ibis.duckdb.connect(students_db_file)
connection.create_table('students', students_memtable)
students = connection.table('students')
students.group_by(students.birth_date.month()).aggregate(
    count=students.count(),
    avg_math_grade=students.math_grade.mean(),
)

Out[8]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ ExtractMonth(birth_date) ┃ count ┃ avg_math_grade ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ int32                    │ int64 │ float64        │
├──────────────────────────┼───────┼────────────────┤
│                        1 │     1 │      66.300000 │
│                        2 │     1 │      49.900000 │
│                        3 │     3 │      60.166667 │
│                        4 │     1 │      46.800000 │
│                        5 │     2 │      54.250000 │
│                        6 │     2 │      67.850000 │
│                        7 │     2 │      53.000000 │
│                        8 │     2 │      56.500000 │
│                       10 │     4 │      54.100000 │
│                       11 │     2 │      57.550000 │
│                        … │     … │              … │
└──────────────────────────┴───────┴────────────────┘

Note that NumPy, Pandas and ibis.memtable are only suitable to generate data that fits in memory. To generate data larger than memory, we can generate data in chunks and iteratively insert the chunks using connection.insert(tablename, pandas_dataframe):

In [9]:

            
                Copied!
                
connection.insert(students.get_name(), make_students_df(10_000, random_seed=43))
students.count()
connection.insert(students.get_name(), make_students_df(10_000, random_seed=43))
students.count()

Out[9]:

In [10]:

            
                Copied!
                
connection.insert(students.get_name(), make_students_df(10_000, random_seed=44))
students.count()
connection.insert(students.get_name(), make_students_df(10_000, random_seed=44))
students.count()

Out[10]:

In [11]:

            
                Copied!
                
students.group_by(students.birth_date.month()).aggregate(
    count=students.count(),
    avg_math_grade=students.math_grade.mean(),
)
students.group_by(students.birth_date.month()).aggregate(
    count=students.count(),
    avg_math_grade=students.math_grade.mean(),
)

Out[11]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ ExtractMonth(birth_date) ┃ count ┃ avg_math_grade ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ int32                    │ int64 │ float64        │
├──────────────────────────┼───────┼────────────────┤
│                        1 │  1751 │      55.217133 │
│                        2 │  1539 │      55.189799 │
│                        3 │  1717 │      55.157833 │
│                        4 │  1630 │      54.603742 │
│                        5 │  1728 │      54.916319 │
│                        6 │  1623 │      54.700924 │
│                        7 │  1710 │      54.884503 │
│                        8 │  1712 │      55.278797 │
│                        9 │  1622 │      55.300062 │
│                       10 │  1679 │      55.220310 │
│                        … │     … │              … │
└──────────────────────────┴───────┴────────────────┘

Inserting data into existing tables¶

Some backends support inserting data into existing tables from expressions. This can be done using connection.insert('table_name', expr).

Last update: October 19, 2022