Tabular core

Basic function to preprocess tabular data before assembling it in a DataLoaders.

Initial preprocessing


source

make_date


def make_date(
    df, date_field
):

Make sure df[date_field] is of the right date type.

df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[us]'))

source

add_datepart


def add_datepart(
    df, field_name, prefix:NoneType=None, drop:bool=True, time:bool=False
):

Helper function that adds columns relevant to a date in the column field_name of df.

For example if we have a series of dates we can then generate features such as Year, Month, Day, Dayofweek, Is_month_start, etc as shown below:

df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
df.head()
Year Month Week Day Dayofweek Dayofyear Is_month_end Is_month_start Is_quarter_end Is_quarter_start Is_year_end Is_year_start Elapsed
0 2019.0 12.0 49.0 4.0 2.0 338.0 False False False False False False 1575417.0
1 NaN NaN NaN NaN NaN NaN False False False False False False NaN
2 2019.0 11.0 46.0 15.0 4.0 319.0 False False False False False False 1573776.0
3 2019.0 10.0 43.0 24.0 3.0 297.0 False False False False False False 1571875.0

source

add_elapsed_times


def add_elapsed_times(
    df, field_names, date_field, base_field
):

Add in df for each event in field_names the elapsed time according to date_field grouped by base_field

df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df.head()
date event base Afterevent Beforeevent event_bw event_fw
0 2019-12-04 False 1 5 0 1.0 0.0
1 2019-11-29 True 1 0 0 1.0 1.0
2 2019-11-15 False 2 22 0 1.0 0.0
3 2019-10-24 True 2 0 0 1.0 1.0

source

cont_cat_split


def cont_cat_split(
    df, max_card:int=20, dep_var:NoneType=None
):

Helper function that returns column names of cont and cat variables from given df.

This function works by determining if a column is continuous or categorical based on the cardinality of its values. If it is above the max_card parameter (or a float datatype) then it will be added to the cont_names else cat_names. An example is below:

# Example with simple numpy types
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
                   'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
                   'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
                   'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
                   'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})
cont_names, cat_names = cont_cat_split(df)
cont_names: ['cont1', 'f16']
cat_names: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']
# Example with pandas types and generated columns
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
                    'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
                    'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
                    'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
                    'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
                    })
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'] = df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True)
cont_names, cat_names = cont_cat_split(df, max_card=0)
cont_names: ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed']
cat_names: ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']

source

df_shrink_dtypes


def df_shrink_dtypes(
    df, skip:list=[], obj2cat:bool=True, int2uint:bool=False
):

Return any possible smaller data types for DataFrame columns. Allows object->category, int->uint, and exclusion.

For example we will make a sample DataFrame with int, float, bool, and object datatypes:

df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
                   'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypes
i         int64
f       float64
e          bool
date        str
dtype: object

We can then call df_shrink_dtypes to find the smallest possible datatype that can support the data:

dt = df_shrink_dtypes(df)
dt
{'i': dtype('int8'), 'f': dtype('float32'), 'date': 'category'}

source

df_shrink


def df_shrink(
    df, skip:list=[], obj2cat:bool=True, int2uint:bool=False
):

Reduce DataFrame memory usage, by casting to smaller types returned by df_shrink_dtypes().

df_shrink(df) attempts to make a DataFrame use less memory, by fitting numeric columns into the smallest suitable datatypes. In addition:

  • boolean, category, datetime64[ns] dtype columns are ignored.
  • ‘object’ type columns are categorified, which can save a lot of memory in large dataset. It can be turned off by obj2cat=False.
  • int2uint=True, to fit int types to uint types, if all data in the column is >= 0.
  • columns can be excluded by name using skip=['col1','col2'].

To get only new column data types without actually casting a DataFrame, use df_shrink_dtypes() with all the same parameters for df_shrink().

df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
                  'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])

Let’s compare the two:

df.dtypes
i         int64
f       float64
u         int64
date        str
dtype: object
df2.dtypes
i          int8
f       float32
u         int16
date        str
dtype: object

We can see that the datatypes changed, and even further we can look at their relative memory usages:

Initial Dataframe: 228 bytes
Reduced Dataframe: 177 bytes

Here’s another example using the ADULT_SAMPLE dataset:

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
Initial Dataframe: 3.907452 megabytes
Reduced Dataframe: 0.814989 megabytes

We reduced the overall memory used by 79%!


source

Tabular


def Tabular(
    df, procs:NoneType=None, cat_names:NoneType=None, cont_names:NoneType=None, y_names:NoneType=None,
    y_block:NoneType=None, splits:NoneType=None, do_setup:bool=True, device:NoneType=None, inplace:bool=False,
    reduce_memory:bool=True
):

A DataFrame wrapper that knows which cols are cont/cat/y, and returns rows in __getitem__

  • df: A DataFrame of your data
  • cat_names: Your categorical x variables
  • cont_names: Your continuous x variables
  • y_names: Your dependent y variables
    • Note: Mixed y’s such as Regression and Classification are not currently supported; however, multiple regression or classification outputs are
  • y_block: How to sub-categorize the type of y_names (CategoryBlock or RegressionBlock)
  • splits: How to split your data
  • do_setup: A parameter for if Tabular will run the data through the procs upon initialization
  • device: cuda or cpu
  • inplace: If True, Tabular will not keep a separate copy of your original DataFrame in memory. You should ensure pd.options.mode.chained_assignment is None before setting this
  • reduce_memory: fastai will attempt to reduce the overall memory usage by the inputted DataFrame with df_shrink

source

TabularPandas


def TabularPandas(
    df, procs:NoneType=None, cat_names:NoneType=None, cont_names:NoneType=None, y_names:NoneType=None,
    y_block:NoneType=None, splits:NoneType=None, do_setup:bool=True, device:NoneType=None, inplace:bool=False,
    reduce_memory:bool=True
):

A Tabular object with transforms


source

TabularProc


def TabularProc(
    enc:NoneType=None, dec:NoneType=None, split_idx:NoneType=None, order:NoneType=None
):

Base class to write a non-lazy tabular processor for dataframes

These transforms are applied as soon as the data is available rather than as data is called from the DataLoader


source

Categorify


def Categorify(
    enc:NoneType=None, dec:NoneType=None, split_idx:NoneType=None, order:NoneType=None
):

Transform the categorical variables to something similar to pd.Categorical

While visually in the DataFrame you will not see a change, the classes are stored in to.procs.categorify as we can see below on a dummy DataFrame:

df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()
a
0 0
1 1
2 2
3 0
4 2

Each column’s unique values are stored in a dictionary of column:[values]:

cat = to.procs.categorify
cat.classes
{'a': ['#na#', np.int8(0), np.int8(1), np.int8(2)]}

source

FillStrategy


def FillStrategy(
    args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):

Namespace containing the various filling strategies.

Currently, filling with the median, a constant, and the mode are supported.


source

FillMissing


def FillMissing(
    fill_strategy:function=median, add_col:bool=True, fill_vals:NoneType=None
):

Fill the missing values in continuous columns.


source

ReadTabBatch


def ReadTabBatch(
    to
):

Transform TabularPandas values into a Tensor with the ability to decode


source

TabDataLoader


def TabDataLoader(
    dataset, # Map- or iterable-style dataset from which to load the data
    bs:int=16, # Size of batch
    shuffle:bool=False, # Whether to shuffle data
    after_batch:NoneType=None, num_workers:int=0, verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
    n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
    wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
    after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None,
    retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None,
    do_batch:NoneType=None
):

A transformed DataLoader for Tabular data


source

TabWeightedDL


def TabWeightedDL(
    dataset, # Map- or iterable-style dataset from which to load the data
    bs:int=16, # Size of batch
    wgts:NoneType=None, shuffle:bool=False, # Whether to shuffle data
    after_batch:NoneType=None, num_workers:int=0, verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
    n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
    wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
    after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None,
    retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None,
    do_batch:NoneType=None
):

A transformed DataLoader for Tabular Weighted data

Integration example

For a more in-depth explanation, see the tabular tutorial

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country salary
0 49 Private 101320 Assoc-acdm 12.0 Married-civ-spouse NaN Wife White Female 0 1902 40 United-States >=50k
1 44 Private 236746 Masters 14.0 Divorced Exec-managerial Not-in-family White Male 10520 0 45 United-States >=50k
2 38 Private 96185 HS-grad NaN Divorced NaN Unmarried Black Female 0 0 32 United-States <50k
3 38 Self-emp-inc 112847 Prof-school 15.0 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 United-States >=50k
4 42 Self-emp-not-inc 82297 7th-8th NaN Married-civ-spouse Other-service Wife Black Female 0 0 50 United-States <50k
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
/Users/jhoward/aai-ws/fastai/fastai/torch_core.py:154: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:212.)
  else as_tensor(x.values, **kwargs) if isinstance(x, (pd.Series, pd.DataFrame))
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num salary
0 State-gov Bachelors Never-married Prof-specialty Not-in-family White False 23.000000 287987.999279 13.0 >=50k
1 State-gov Some-college Never-married Tech-support Unmarried Black False 32.000000 131587.998019 10.0 <50k
2 Private HS-grad Married-civ-spouse Other-service Husband White False 58.000000 183810.000021 9.0 <50k
3 Self-emp-inc Some-college Married-civ-spouse Sales Husband White False 83.999998 172906.999298 10.0 >=50k
4 Local-gov Bachelors Never-married Sales Own-child White False 31.000000 128016.002079 13.0 <50k
5 Private Some-college Never-married Adm-clerical Not-in-family White False 22.000001 58915.997617 10.0 <50k
6 ? 11th Never-married ? Not-in-family White False 26.000000 176966.999519 7.0 <50k
7 Private Bachelors Married-civ-spouse Prof-specialty Other-relative Asian-Pac-Islander False 23.999999 86744.998003 13.0 <50k
8 Private Some-college Never-married Other-service Not-in-family White False 38.000000 123833.001583 10.0 <50k
9 Private 11th Married-civ-spouse Farming-fishing Husband White False 35.000000 168322.000103 7.0 <50k
to.show()
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num salary
3564 Self-emp-not-inc HS-grad Married-civ-spouse Craft-repair Husband White False 50.0 124793.0 9.0 <50k
5802 Private HS-grad Never-married Adm-clerical Own-child White False 30.0 205204.0 9.0 <50k
1087 Self-emp-not-inc HS-grad Married-civ-spouse Farming-fishing Husband White False 55.0 149168.0 9.0 <50k
1239 Private HS-grad Never-married Other-service Own-child White False 25.0 104193.0 9.0 <50k
2234 Private HS-grad Never-married Handlers-cleaners Other-relative White False 22.0 361138.0 9.0 <50k
8032 Private Bachelors Married-civ-spouse Sales Husband White False 54.0 391016.0 13.0 >=50k
4364 Private HS-grad Married-civ-spouse Craft-repair Husband White False 45.0 380922.0 9.0 >=50k
5404 Self-emp-inc HS-grad Married-civ-spouse Farming-fishing Husband White False 60.0 160062.0 9.0 <50k
693 State-gov Masters Never-married Prof-specialty Not-in-family Asian-Pac-Islander True 27.0 315640.0 10.0 <50k
2684 Self-emp-not-inc Bachelors Married-civ-spouse Farming-fishing Husband White False 84.0 155057.0 13.0 <50k

We can decode any set of transformed data by calling to.decode_row with our raw data:

row = to.items.iloc[0]
to.decode_row(row)
age                                50.0
workclass              Self-emp-not-inc
fnlwgt                         124793.0
education                       HS-grad
education-num                       9.0
marital-status       Married-civ-spouse
occupation                 Craft-repair
relationship                    Husband
race                              White
sex                                Male
capital-gain                          0
capital-loss                          0
hours-per-week                       30
native-country            United-States
salary                             <50k
education-num_na                  False
Name: 3564, dtype: object

We can make new test datasets based on the training data with the to.new() method

Note

Since machine learning models can’t magically understand categories it was never trained on, the data should reflect this. If there are different missing values in your test data you should address this before training

to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country education-num_na
10000 0.470185 5 1.346422 10 1.165893 3 2 1 2 Male 0 0 40 Philippines 1
10001 -0.923825 5 1.258980 12 -0.424752 3 15 1 4 Male 0 0 40 United-States 1
10002 1.057137 5 0.150986 2 -1.220074 1 9 2 5 Female 0 0 37 United-States 1
10003 0.543554 5 -0.284206 12 -0.424752 7 2 5 5 Female 0 0 43 United-States 1
10004 0.763661 6 1.449451 9 0.370571 3 5 1 5 Male 0 0 60 United-States 1

We can then convert it to a DataLoader:

tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num
0 Private Bachelors Married-civ-spouse Adm-clerical Husband Asian-Pac-Islander False 45.0 338104.995686 13.0
1 Private HS-grad Married-civ-spouse Transport-moving Husband Other False 26.0 328662.994908 9.0
2 Private 11th Divorced Other-service Not-in-family White False 53.0 209021.999974 7.0
3 Private HS-grad Widowed Adm-clerical Unmarried White False 46.0 162029.999608 9.0
4 Self-emp-inc Assoc-voc Married-civ-spouse Exec-managerial Husband White False 49.0 349229.997552 11.0
5 Local-gov Some-college Married-civ-spouse Exec-managerial Husband White False 34.0 124826.999041 10.0
6 Self-emp-inc Some-college Married-civ-spouse Sales Husband White False 53.0 290640.000020 10.0
7 Private Some-college Never-married Sales Own-child White False 19.0 106273.001990 10.0
8 Private Some-college Married-civ-spouse Protective-serv Husband Black False 72.0 53684.003204 10.0
9 Private Some-college Never-married Sales Own-child White False 20.0 505979.987402 10.0
# Create a TabWeightedDL
train_ds = to.train
weights = np.random.random(len(train_ds))
train_dl = TabWeightedDL(train_ds, wgts=weights, bs=64, shuffle=True)

train_dl.show_batch()
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num salary
0 Private HS-grad Widowed Craft-repair Unmarried White False 58.000000 178644.000556 9.0 <50k
1 Private HS-grad Married-civ-spouse Handlers-cleaners Husband White False 21.000000 131811.002389 9.0 <50k
2 Private 10th Never-married #na# Own-child White True 16.999999 294485.002377 10.0 <50k
3 Private HS-grad Widowed Adm-clerical Not-in-family White False 66.000001 98836.997894 9.0 <50k
4 Self-emp-not-inc HS-grad Married-civ-spouse Craft-repair Husband White False 28.000000 420054.007486 9.0 <50k
5 Federal-gov Assoc-voc Divorced Craft-repair Not-in-family White False 43.000000 92774.997808 11.0 <50k
6 ? Some-college Never-married ? Own-child White False 19.000000 234518.999208 10.0 <50k
7 Private HS-grad Married-civ-spouse Craft-repair Husband White False 35.000000 186009.000006 9.0 >=50k
8 Federal-gov HS-grad Never-married Adm-clerical Own-child Black False 25.000000 144258.998968 9.0 <50k
9 ? Some-college Never-married ? Own-child White False 20.000000 117788.996828 10.0 <50k

TabDataLoader’s create_item method

df = pd.DataFrame([{'age': 35}])
to = TabularPandas(df)
dls = to.dataloaders()
print(dls.create_item(0))
# test_eq(dls.create_item(0).items.to_dict(), {'age': 0.5330614747286777, 'workclass': 5, 'fnlwgt': -0.26305443080666174, 'education': 10, 'education-num': 1.169790230219763, 'marital-status': 1, 'occupation': 13, 'relationship': 5, 'race': 3, 'sex': ' Female', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 35, 'native-country': 'United-States', 'salary': 1, 'education-num_na': 1})
age    35
Name: 0, dtype: int8

Other target types

Multi-label categories

one-hot encoded label

def _mock_multi_label(df):
    sal,sex,white = [],[],[]
    for row in df.itertuples():
        sal.append(row.salary == '>=50k')
        sex.append(row.sex == ' Male')
        white.append(row.race == ' White')
    df['salary'] = np.array(sal)
    df['male']   = np.array(sex)
    df['white']  = np.array(white)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country salary male white
0 49 Private 101320 Assoc-acdm 12.0 Married-civ-spouse NaN Wife White Female 0 1902 40 United-States True False True
1 44 Private 236746 Masters 14.0 Divorced Exec-managerial Not-in-family White Male 10520 0 45 United-States True True True
2 38 Private 96185 HS-grad NaN Divorced NaN Unmarried Black Female 0 0 32 United-States False False False
3 38 Self-emp-inc 112847 Prof-school 15.0 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 United-States True True False
4 42 Self-emp-not-inc 82297 7th-8th NaN Married-civ-spouse Other-service Wife Black Female 0 0 50 United-States False False False
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
CPU times: user 30.8 ms, sys: 774 us, total: 31.6 ms
Wall time: 31.4 ms
dls = to.dataloaders()
dls.valid.show_batch()
workclass education marital-status occupation relationship race education-num_na age fnlwgt education-num salary male white
0 Private HS-grad Married-civ-spouse Machine-op-inspct Wife Black False 35.000000 110668.001907 9.0 False False False
1 Private HS-grad Married-civ-spouse Craft-repair Husband White False 51.000000 33303.995791 9.0 False True True
2 Private Some-college Never-married Adm-clerical Not-in-family White False 37.000000 38468.006454 10.0 False False True
3 ? Some-college Married-civ-spouse ? Husband White False 63.999999 108082.000741 10.0 False True True
4 Private Bachelors Married-civ-spouse Prof-specialty Husband White False 38.000000 187747.999948 13.0 True True True
5 Private HS-grad Never-married Tech-support Not-in-family Asian-Pac-Islander False 28.000000 375313.002386 9.0 False True False
6 Private HS-grad Divorced #na# Unmarried White True 36.000000 130199.998135 10.0 False True True
7 Private 11th Married-civ-spouse Craft-repair Husband White False 36.000000 123151.001427 7.0 False True True
8 Private Bachelors Never-married Adm-clerical Not-in-family Asian-Pac-Islander False 32.000000 107218.002435 13.0 False True False
9 Private 11th Divorced Other-service Unmarried White False 32.000000 185732.000241 7.0 False False True

Not one-hot encoded

def _mock_multi_label(df):
    targ = []
    for row in df.itertuples():
        labels = []
        if row.salary == '>=50k': labels.append('>50k')
        if row.sex == ' Male':   labels.append('male')
        if row.race == ' White': labels.append('white')
        targ.append(' '.join(labels))
    df['target'] = np.array(targ)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country salary target
0 49 Private 101320 Assoc-acdm 12.0 Married-civ-spouse NaN Wife White Female 0 1902 40 United-States >=50k >50k white
1 44 Private 236746 Masters 14.0 Divorced Exec-managerial Not-in-family White Male 10520 0 45 United-States >=50k >50k male white
2 38 Private 96185 HS-grad NaN Divorced NaN Unmarried Black Female 0 0 32 United-States <50k
3 38 Self-emp-inc 112847 Prof-school 15.0 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 United-States >=50k >50k male
4 42 Self-emp-not-inc 82297 7th-8th NaN Married-civ-spouse Other-service Wife Black Female 0 0 50 United-States <50k
@MultiCategorize
def encodes(self, to:Tabular):
    # NOTE(review): target encoding is currently disabled — the line below, which
    # would map each label in `y_names` to its vocab index, is commented out, so
    # this registration is a no-op that returns `to` unchanged.
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to

@MultiCategorize
def decodes(self, to:Tabular):
    # NOTE(review): target decoding is currently disabled — the line below, which
    # would map vocab indices in `y_names` back to labels, is commented out, so
    # this registration is a no-op that returns `to` unchanged.
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
CPU times: user 10.5 ms, sys: 201 us, total: 10.7 ms
Wall time: 10.6 ms
to.procs[2].vocab
['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']

Regression

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
CPU times: user 21.8 ms, sys: 969 us, total: 22.7 ms
Wall time: 21.9 ms
to.procs[-1].means
{'fnlwgt': np.float64(192511.077125),
 'education-num': np.float64(10.076749801635742)}
dls = to.dataloaders()
dls.valid.show_batch()
workclass education marital-status occupation relationship race education-num_na fnlwgt education-num age
0 Local-gov Some-college Married-spouse-absent Exec-managerial Unmarried Black False 216129.000726 10.0 38.0
1 Private HS-grad Divorced Protective-serv Not-in-family Black False 162814.000297 9.0 34.0
2 Private Bachelors Married-civ-spouse Exec-managerial Husband White False 193881.999975 13.0 44.0
3 Local-gov HS-grad Married-civ-spouse Exec-managerial Husband White False 144778.000789 9.0 43.0
4 Private HS-grad Never-married Other-service Own-child White False 304385.993937 9.0 24.0
5 Private 5th-6th Never-married Craft-repair Not-in-family White False 155621.001484 3.0 28.0
6 Self-emp-not-inc 7th-8th Divorced Other-service Unmarried White False 385632.000374 4.0 56.0
7 Private Bachelors Never-married Prof-specialty Own-child White False 244365.998597 13.0 22.0
8 Private 11th Divorced Handlers-cleaners Own-child White False 112262.998271 7.0 30.0
9 Local-gov Some-college Married-civ-spouse Protective-serv Husband White False 195258.000082 10.0 41.0

Not being used now - for multi-modal

class TensorTabular(fastuple):
    "Tuple of tabular tensors that can build and display per-sample contexts"
    def get_ctxs(self, max_n=10, **kwargs):
        # One empty-row context per sample, capped at `max_n`;
        # assumes self[0] is batch-first — TODO confirm.
        count = min(max_n, self[0].shape[0])
        frame = pd.DataFrame(index=range(count))
        return [frame.iloc[idx] for idx in range(count)]

    def display(self, ctxs):
        display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "One row of a dataframe that can render itself for display"
    def show(self, ctx=None, **kwargs):
        # With no context, the row displays itself; otherwise append to `ctx`.
        if ctx is None: return self
        return ctx.append(self)

class ReadTabLine(ItemTransform):
    "Encode a raw row into a `TensorTabular`; decode tensors back into a `TabularLine`"
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        # Look up categorical and continuous values by column name from the row.
        get = row.__getitem__
        cats  = self.proc.cat_names.map(get)
        conts = self.proc.cont_names.map(get)
        return TensorTabular(tensor(cats).long(), tensor(conts).float())

    def decodes(self, o):
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        names = self.proc.cat_names + self.proc.cont_names
        vals  = to.items[0] + to.items[1]
        return TabularLine(pd.Series(dict(zip(names, vals))))

class ReadTabTarget(ItemTransform):
    "Encode the dependent column as int64; decode an index back to its `Category`"
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        return row[self.proc.y_names].astype(np.int64)

    def decodes(self, o):
        # Map the encoded index back through the processor's class list.
        classes = self.proc.classes[self.proc.y_names]
        return Category(classes[o])
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")