More Value Expressions¶

Let's walk through some more value expressions.

Setup¶

In [1]:

            
                Copied!
                
!curl -LsS -o geography.db 'https://storage.googleapis.com/ibis-tutorial-data/geography.db'
!curl -LsS -o geography.db 'https://storage.googleapis.com/ibis-tutorial-data/geography.db'

In [2]:

            
                Copied!
                
import os
import tempfile

import ibis

# Interactive mode: a bare expression at the end of a cell is rendered as a result.
ibis.options.interactive = True

# Open the SQLite database file fetched by the download cell.
connection = ibis.sqlite.connect("geography.db")

Type casting¶

The Ibis type system supports the most common data types used in analytics, including support for nested types like lists, structs, and maps.

Type names can be used to cast from one type to another.

In [3]:

            
                Copied!
                
# Table expression for the `countries` table; displayed as a preview below.
countries = connection.table("countries")
countries

Out[3]:

┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━┓
┃ iso_alpha2 ┃ iso_alpha3 ┃ iso_numeric ┃ fips   ┃ name                 ┃ capital          ┃ area_km2     ┃ … ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━┩
│ string     │ string     │ int32       │ string │ string               │ string           │ float64      │ … │
├────────────┼────────────┼─────────────┼────────┼──────────────────────┼──────────────────┼──────────────┼───┤
│ AD         │ AND        │          20 │ AN     │ Andorra              │ Andorra la Vella │ 4.680000e+02 │ … │
│ AE         │ ARE        │         784 │ AE     │ United Arab Emirates │ Abu Dhabi        │ 8.288000e+04 │ … │
│ AF         │ AFG        │           4 │ AF     │ Afghanistan          │ Kabul            │ 6.475000e+05 │ … │
│ AG         │ ATG        │          28 │ AC     │ Antigua and Barbuda  │ St. Johns        │ 4.430000e+02 │ … │
│ AI         │ AIA        │         660 │ AV     │ Anguilla             │ The Valley       │ 1.020000e+02 │ … │
│ AL         │ ALB        │           8 │ AL     │ Albania              │ Tirana           │ 2.874800e+04 │ … │
│ AM         │ ARM        │          51 │ AM     │ Armenia              │ Yerevan          │ 2.980000e+04 │ … │
│ AN         │ ANT        │         530 │ NT     │ Netherlands Antilles │ Willemstad       │ 9.600000e+02 │ … │
│ AO         │ AGO        │          24 │ AO     │ Angola               │ Luanda           │ 1.246700e+06 │ … │
│ AQ         │ ATA        │          10 │ AY     │ Antarctica           │ ~                │ 1.400000e+07 │ … │
│ …          │ …          │           … │ …      │ …                    │ …                │            … │ … │
└────────────┴────────────┴─────────────┴────────┴──────────────────────┴──────────────────┴──────────────┴───┘

In [4]:

            
                Copied!
                
countries = connection.table("countries")

# Cast the integer population column to float before summing it.
population_as_float = countries.population.cast("float")
population_as_float.sum()

Out[4]:

6878963738.0

In [5]:

            
                Copied!
                
# Cast the float64 area column to a 32-bit integer, then total it.
area_as_int = countries.area_km2.cast("int32")
area_as_int.sum()

Out[5]:

150012536

Case / if-then-else expressions¶

We support a number of variants of the SQL-equivalent CASE expression, and will add more API functions over time to meet different use cases and enhance the expressiveness of any branching-based value logic.

In [6]:

            
                Copied!
                
                    
                    
                
                

        
# Continent code -> display name, in the order the WHEN branches are added.
continent_names = {
    "AF": "Africa",
    "AN": "Antarctica",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "North America",
    "OC": "Oceania",
    "SA": "South America",
}

# Build the CASE expression one branch at a time.
case_builder = countries.continent.case()
for code, label in continent_names.items():
    case_builder = case_builder.when(code, label)

# Codes without a branch fall back to the raw continent value.
expr = case_builder.else_(countries.continent).end().name("continent_name")

expr.value_counts()

Out[6]:

┏━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ continent_name ┃ count ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ string         │ int64 │
├────────────────┼───────┤
│ Africa         │    58 │
│ Antarctica     │     5 │
│ Asia           │    51 │
│ Europe         │    54 │
│ North America  │    42 │
│ Oceania        │    28 │
│ South America  │    14 │
└────────────────┴───────┘

If the else_ default condition is not provided, any values not matching one of the conditions will be NULL.

In [7]:

            
                Copied!
                
                    
                    
                
                

        
# (code, name) pairs; 'AN' is deliberately left out so it has no branch.
known_continents = [
    ("AF", "Africa"),
    ("AS", "Asia"),
    ("EU", "Europe"),
    ("NA", "North America"),
    ("OC", "Oceania"),
    ("SA", "South America"),
]

case_builder = countries.continent.case()
for code, label in known_continents:
    case_builder = case_builder.when(code, label)

# No else_ default: values that match no branch come out as NULL.
expr = case_builder.end().name("continent_name_with_nulls")

expr.value_counts()

Out[7]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ continent_name_with_nulls ┃ count ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ string                    │ int64 │
├───────────────────────────┼───────┤
│ ∅                         │     5 │
│ Africa                    │    58 │
│ Asia                      │    51 │
│ Europe                    │    54 │
│ North America             │    42 │
│ Oceania                   │    28 │
│ South America             │    14 │
└───────────────────────────┴───────┘

To test for an arbitrary series of boolean conditions, use the case API method and pass any boolean expressions potentially involving columns of the table:

In [8]:

            
                Copied!
                
                    
                    
                
                

        
# Boolean column expressions used as the CASE conditions.
is_big = countries.population > 25_000_000
is_small = countries.population < 5_000_000

# Branches are listed in order; anything neither big nor small is 'medium'.
size_case = ibis.case().when(is_big, "big").when(is_small, "small").else_("medium")
expr = size_case.end().name("size")

countries["name", "population", expr].limit(10)

Out[8]:

┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
┃ name                 ┃ population ┃ size   ┃
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
│ string               │ int32      │ string │
├──────────────────────┼────────────┼────────┤
│ Andorra              │      84000 │ small  │
│ United Arab Emirates │    4975593 │ small  │
│ Afghanistan          │   29121286 │ big    │
│ Antigua and Barbuda  │      86754 │ small  │
│ Anguilla             │      13254 │ small  │
│ Albania              │    2986952 │ small  │
│ Armenia              │    2968000 │ small  │
│ Netherlands Antilles │     300000 │ small  │
│ Angola               │   13068161 │ medium │
│ Antarctica           │          0 │ small  │
└──────────────────────┴────────────┴────────┘

Simple ternary-cases (like the Python X if COND else Y) can be written using the ifelse function:

In [9]:

            
                Copied!
                
# Boolean condition, then a two-way choice between string labels.
in_asia = countries.continent == "AS"
expr = in_asia.ifelse("Asia", "Not Asia").name("is_asia")

countries["name", "continent", expr].limit(10)

Out[9]:

┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
┃ name                 ┃ continent ┃ is_asia  ┃
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
│ string               │ string    │ string   │
├──────────────────────┼───────────┼──────────┤
│ Andorra              │ EU        │ Not Asia │
│ United Arab Emirates │ AS        │ Asia     │
│ Afghanistan          │ AS        │ Asia     │
│ Antigua and Barbuda  │ NA        │ Not Asia │
│ Anguilla             │ NA        │ Not Asia │
│ Albania              │ EU        │ Not Asia │
│ Armenia              │ AS        │ Asia     │
│ Netherlands Antilles │ NA        │ Not Asia │
│ Angola               │ AF        │ Not Asia │
│ Antarctica           │ AN        │ Not Asia │
└──────────────────────┴───────────┴──────────┘

Set membership¶

The isin and notin functions are like their pandas counterparts. These can take:

A list of value expressions, either literal values or other column expressions
An array/column expression of some kind

In [10]:

            
                Copied!
                
# Membership test against a literal list of continent codes.
is_america = countries.continent.isin(["NA", "SA"])

# Keep only the matching rows, then count them per continent.
american_countries = countries[is_america]
american_countries.continent.value_counts()

Out[10]:

┏━━━━━━━━━━━┳━━━━━━━┓
┃ continent ┃ count ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ string    │ int64 │
├───────────┼───────┤
│ NA        │    42 │
│ SA        │    14 │
└───────────┴───────┘

You can also check for membership in an array. Here is an example of filtering based on the top 3 (ignoring ties) most frequently occurring values in the continent column of countries:

In [11]:

            
                Copied!
                
# value_counts() does not sort by frequency, so order by the `count` column
# (descending) before taking the first three groups.
continent_counts = countries.continent.value_counts()
top_continents = continent_counts.order_by(ibis.desc("count")).limit(3).continent

# Keep only the rows whose continent is one of those three.
top_continents_filter = countries.continent.isin(top_continents)
expr = countries[top_continents_filter]

expr.count()

Out[11]:

This is a common enough operation that we provide a special analytical filter function topk:

In [12]:

            
                Copied!
                
# The three most frequent continent codes together with their counts.
top_three_continents = countries.continent.topk(3)
top_three_continents

Out[12]:

┏━━━━━━━━━━━┳━━━━━━━┓
┃ continent ┃ count ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ string    │ int64 │
├───────────┼───────┤
│ AF        │    58 │
│ EU        │    54 │
│ AS        │    51 │
└───────────┴───────┘

Cool, huh? More on topk later.

Null Checking¶

Like their pandas equivalents, the isnull and notnull functions return True values if the values are null, or non-null, respectively. For example:

In [13]:

            
                Copied!
                
                    
                    
                
                

        
# Label only three continents; with no else_ default, the rest become NULL.
top_continent_case = (
    countries.continent.case()
    .when("AF", "Africa")
    .when("EU", "Europe")
    .when("AS", "Asia")
)
expr = top_continent_case.end().name("top_continent_name")

# Count how many rows ended up NULL versus labelled.
null_flags = expr.isnull()
null_flags.value_counts()

Out[13]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ IsNull(top_continent_name) ┃ count ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ boolean                    │ int64 │
├────────────────────────────┼───────┤
│ False                      │   163 │
│ True                       │    89 │
└────────────────────────────┴───────┘

Functions like isnull can be combined with case expressions or functions like ifelse to replace null values with some other value. ifelse here will use the first value supplied for any True value and the second value for any False value. Either value can be a scalar or array.

In [14]:

            
                Copied!
                
# Where the label is NULL use a fixed string, otherwise keep the label.
label_is_missing = expr.isnull()
expr2 = label_is_missing.ifelse("Other continent", expr).name("continent")
expr2.value_counts()

Out[14]:

┏━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ continent       ┃ count ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ string          │ int64 │
├─────────────────┼───────┤
│ Africa          │    58 │
│ Asia            │    51 │
│ Europe          │    54 │
│ Other continent │    89 │
└─────────────────┴───────┘

Distinct-based operations¶

Ibis supports using distinct to remove duplicate rows or values on tables or arrays. For example:

In [15]:

            
                Copied!
                
# Project down to the single column, then drop duplicate rows.
continent_only = countries[["continent"]]
continent_only.distinct()

Out[15]:

┏━━━━━━━━━━━┓
┃ continent ┃
┡━━━━━━━━━━━┩
│ string    │
├───────────┤
│ EU        │
│ AS        │
│ NA        │
│ AF        │
│ AN        │
│ SA        │
│ OC        │
└───────────┘

This can be combined with count to form a reduction metric:

In [16]:

            
                Copied!
                
# Number of distinct continent values, exposed as a named scalar metric.
distinct_continents = countries[["continent"]].distinct()
metric = distinct_continents.count().name("num_continents")
metric

Out[16]:

String operations¶

What's supported is pretty basic right now. We intend to support the full gamut of regular expression munging with a nice API, though in some cases some work will be required on SQLite's backend to support everything.

In [17]:

            
                Copied!
                
# Preview the first five country names.
names_only = countries[["name"]]
names_only.limit(5)

Out[17]:

┏━━━━━━━━━━━━━━━━━━━━━━┓
┃ name                 ┃
┡━━━━━━━━━━━━━━━━━━━━━━┩
│ string               │
├──────────────────────┤
│ Andorra              │
│ United Arab Emirates │
│ Afghanistan          │
│ Antigua and Barbuda  │
│ Anguilla             │
└──────────────────────┘

At the moment, basic substring operations (substr, with conveniences left and right) and Python-like APIs such as lower and upper (for case normalization) are supported. So you could count first letter occurrences in a string column like so:

In [18]:

            
                Copied!
                
# Lower-case each name and keep its first character.
lowercase_names = countries.name.lower()
expr = lowercase_names.left(1).name("first_letter")

# Ten most common first letters, largest count first.
letter_counts = expr.value_counts()
letter_counts.order_by(("count", False)).limit(10)

Out[18]:

┏━━━━━━━━━━━━━━┳━━━━━━━┓
┃ first_letter ┃ count ┃
┡━━━━━━━━━━━━━━╇━━━━━━━┩
│ string       │ int64 │
├──────────────┼───────┤
│ s            │    34 │
│ b            │    22 │
│ m            │    22 │
│ c            │    21 │
│ g            │    16 │
│ n            │    16 │
│ a            │    15 │
│ t            │    14 │
│ p            │    12 │
│ i            │    10 │
└──────────────┴───────┘

For fuzzy and regex filtering/searching, you can use one of the following:

like, works as the SQL LIKE keyword
rlike, like re.search or SQL RLIKE
contains, like x in str_value in Python

In [19]:

            
                Copied!
                
# SQL LIKE pattern applied to the name column as stored (no case normalization).
name_like_ge = countries.name.like("%GE%")
countries[name_like_ge].name

Out[19]:

┏━━━━━━━━━━━━━━━━━━━━━━┓
┃ name                 ┃
┡━━━━━━━━━━━━━━━━━━━━━━┩
│ string               │
└──────────────────────┘

In [20]:

            
                Copied!
                
# Regex match against the lower-cased names.
name_matches_ge = countries.name.lower().rlike(".*ge.*")
countries[name_matches_ge].name

Out[20]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ name                                     ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string                                   │
├──────────────────────────────────────────┤
│ Argentina                                │
│ Germany                                  │
│ Algeria                                  │
│ Georgia                                  │
│ South Georgia and South Sandwich Islands │
│ Niger                                    │
│ Nigeria                                  │
└──────────────────────────────────────────┘

In [21]:

            
                Copied!
                
# Plain substring test against the lower-cased names.
name_contains_ge = countries.name.lower().contains("ge")
countries[name_contains_ge].name

Out[21]:

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ name                                     ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string                                   │
├──────────────────────────────────────────┤
│ Argentina                                │
│ Germany                                  │
│ Algeria                                  │
│ Georgia                                  │
│ South Georgia and South Sandwich Islands │
│ Niger                                    │
│ Nigeria                                  │
└──────────────────────────────────────────┘

Timestamp operations¶

Date and time functionality is relatively limited at present compared with pandas, but we'll get there. The main things we have right now are:

Field access (year, month, day, ...)
Timedeltas
Comparisons with fixed timestamps

In [22]:

            
                Copied!
                
independence = connection.table("independence")

# Show each independence date next to its extracted month number.
independence_date = independence.independence_date
independence_month = independence_date.month().name("month")

independence[independence_date, independence_month].limit(10)

Out[22]:

┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ independence_date ┃ month ┃
┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ date              │ int32 │
├───────────────────┼───────┤
│ 1919-08-19        │     8 │
│ 1912-11-28        │    11 │
│ 1962-07-05        │     7 │
│ 1975-11-11        │    11 │
│ 1981-11-01        │    11 │
│ 1816-07-09        │     7 │
│ 1918-05-28        │     5 │
│ 1991-09-21        │     9 │
│ 1955-10-26        │    10 │
│ 1918-05-28        │     5 │
└───────────────────┴───────┘

Somewhat more comprehensively, several date fields can be extracted at once:

In [23]:

            
                Copied!
                
                    
                    
                
                

        
def get_field(f):
    """Return the date component named ``f`` of ``independence_date``.

    ``f`` is the name of a field-accessor method on the column (for example
    ``'year'``); the resulting expression is named after that same field.
    """
    accessor = getattr(independence.independence_date, f)
    return accessor().name(f)


# datetime fields can also use: 'hour', 'minute', 'second', 'millisecond'
fields = ["year", "month", "day"]

# The original date column followed by one extracted column per field.
projection = [independence.independence_date, *map(get_field, fields)]
independence[projection].limit(10)

Out[23]:

┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓
┃ independence_date ┃ year  ┃ month ┃ day   ┃
┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩
│ date              │ int32 │ int32 │ int32 │
├───────────────────┼───────┼───────┼───────┤
│ 1919-08-19        │  1919 │     8 │    19 │
│ 1912-11-28        │  1912 │    11 │    28 │
│ 1962-07-05        │  1962 │     7 │     5 │
│ 1975-11-11        │  1975 │    11 │    11 │
│ 1981-11-01        │  1981 │    11 │     1 │
│ 1816-07-09        │  1816 │     7 │     9 │
│ 1918-05-28        │  1918 │     5 │    28 │
│ 1991-09-21        │  1991 │     9 │    21 │
│ 1955-10-26        │  1955 │    10 │    26 │
│ 1918-05-28        │  1918 │     5 │    28 │
└───────────────────┴───────┴───────┴───────┘

For timestamp arithmetic and comparisons, check out functions in the top level ibis namespace. This includes things like day and second, but also the ibis.timestamp function:

In [24]:

            
                Copied!
                
# Earliest date, latest date and row count, selected as three columns.
independence_date = independence.independence_date
summary_columns = [
    independence_date.min(),
    independence_date.max(),
    independence.count().name("nrows"),
]

# distinct() then removes the duplicate rows from that selection.
independence[summary_columns].distinct()

Out[24]:

┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ Min(independence_date) ┃ Max(independence_date) ┃ nrows ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ date                   │ date                   │ int64 │
├────────────────────────┼────────────────────────┼───────┤
│ 1291-08-01             │ 2011-07-09             │   186 │
└────────────────────────┴────────────────────────┴───────┘

In [25]:

            
                Copied!
                
# Count the rows whose independence date is later than 2000-01-01.
after_2000 = independence.independence_date > "2000-01-01"
independence[after_2000].count()

Out[25]:

Some backends support adding offsets. For example:

# Add a one-day interval to every independence date (only on backends that support offsets).
independence.independence_date + ibis.interval(days=1)
# Subtract each independence date from the current timestamp.
ibis.now() - independence.independence_date

Last update: January 5, 2023