df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[us]'))

Tabular core
DataLoaders.
Initial preprocessing
make_date
def make_date(
df, date_field
):
Make sure df[date_field] is of the right date type.
add_datepart
def add_datepart(
df, field_name, prefix:NoneType=None, drop:bool=True, time:bool=False
):
Helper function that adds columns relevant to a date in the column field_name of df.
For example if we have a series of dates we can then generate features such as Year, Month, Day, Dayofweek, Is_month_start, etc as shown below:
df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
df.head()| Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019.0 | 12.0 | 49.0 | 4.0 | 2.0 | 338.0 | False | False | False | False | False | False | 1575417.0 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | False | False | False | False | False | False | NaN |
| 2 | 2019.0 | 11.0 | 46.0 | 15.0 | 4.0 | 319.0 | False | False | False | False | False | False | 1573776.0 |
| 3 | 2019.0 | 10.0 | 43.0 | 24.0 | 3.0 | 297.0 | False | False | False | False | False | False | 1571875.0 |
add_elapsed_times
def add_elapsed_times(
df, field_names, date_field, base_field
):
Add in df for each event in field_names the elapsed time according to date_field grouped by base_field
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df.head()| date | event | base | Afterevent | Beforeevent | event_bw | event_fw | |
|---|---|---|---|---|---|---|---|
| 0 | 2019-12-04 | False | 1 | 5 | 0 | 1.0 | 0.0 |
| 1 | 2019-11-29 | True | 1 | 0 | 0 | 1.0 | 1.0 |
| 2 | 2019-11-15 | False | 2 | 22 | 0 | 1.0 | 0.0 |
| 3 | 2019-10-24 | True | 2 | 0 | 0 | 1.0 | 1.0 |
cont_cat_split
def cont_cat_split(
df, max_card:int=20, dep_var:NoneType=None
):
Helper function that returns column names of cont and cat variables from given df.
This function works by determining if a column is continuous or categorical based on the cardinality of its values. If it is above the max_card parameter (or a float datatype) then it will be added to the cont_names else cat_names. An example is below:
# Example with simple numpy types
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})
cont_names, cat_names = cont_cat_split(df)
cont_names: ['cont1', 'f16']
cat_names: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']
# Example with pandas types and generated columns
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
})
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'] = df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True)
cont_names, cat_names = cont_cat_split(df, max_card=0)cont_names: ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed']
cat_names: ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
df_shrink_dtypes
def df_shrink_dtypes(
df, skip:list=[], obj2cat:bool=True, int2uint:bool=False
):
Return any possible smaller data types for DataFrame columns. Allows object->category, int->uint, and exclusion.
For example we will make a sample DataFrame with int, float, bool, and object datatypes:
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypesi int64
f float64
e bool
date str
dtype: object
We can then call df_shrink_dtypes to find the smallest possible datatype that can support the data:
dt = df_shrink_dtypes(df)
dt{'i': dtype('int8'), 'f': dtype('float32'), 'date': 'category'}
df_shrink
def df_shrink(
df, skip:list=[], obj2cat:bool=True, int2uint:bool=False
):
Reduce DataFrame memory usage, by casting to smaller types returned by df_shrink_dtypes().
df_shrink(df) attempts to make a DataFrame use less memory, by fitting numeric columns into the smallest suitable datatypes. In addition:

- `boolean`, `category`, and `datetime64[ns]` dtype columns are ignored.
- 'object' type columns are categorified, which can save a lot of memory in large datasets. This can be turned off with `obj2cat=False`.
- `int2uint=True` fits `int` types to `uint` types, if all data in the column is >= 0.
- Columns can be excluded by name using `excl_cols=['col1','col2']`.

To get only new column data types without actually casting a DataFrame, use df_shrink_dtypes() with all the same parameters as df_shrink().
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])Let’s compare the two:
df.dtypesi int64
f float64
u int64
date str
dtype: object
df2.dtypesi int8
f float32
u int16
date str
dtype: object
We can see that the datatypes changed, and even further we can look at their relative memory usages:
Initial Dataframe: 228 bytes
Reduced Dataframe: 177 bytes
Here’s another example using the ADULT_SAMPLE dataset:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)Initial Dataframe: 3.907452 megabytes
Reduced Dataframe: 0.814989 megabytes
We reduced the overall memory used by 79%!
Tabular
def Tabular(
df, procs:NoneType=None, cat_names:NoneType=None, cont_names:NoneType=None, y_names:NoneType=None,
y_block:NoneType=None, splits:NoneType=None, do_setup:bool=True, device:NoneType=None, inplace:bool=False,
reduce_memory:bool=True
):
A DataFrame wrapper that knows which cols are cont/cat/y, and returns rows in __getitem__
- df: A DataFrame of your data
- cat_names: Your categorical x variables
- cont_names: Your continuous x variables
- y_names: Your dependent y variables. Note: mixed y's such as regression and classification are not currently supported; however, multiple regression or classification outputs are
- y_block: How to sub-categorize the type of y_names (CategoryBlock or RegressionBlock)
- splits: How to split your data
- do_setup: A parameter for if Tabular will run the data through the procs upon initialization
- device: cuda or cpu
- inplace: If True, Tabular will not keep a separate copy of your original DataFrame in memory. You should ensure pd.options.mode.chained_assignment is None before setting this
- reduce_memory: fastai will attempt to reduce the overall memory usage by the inputted DataFrame with df_shrink
TabularPandas
def TabularPandas(
df, procs:NoneType=None, cat_names:NoneType=None, cont_names:NoneType=None, y_names:NoneType=None,
y_block:NoneType=None, splits:NoneType=None, do_setup:bool=True, device:NoneType=None, inplace:bool=False,
reduce_memory:bool=True
):
A Tabular object with transforms
TabularProc
def TabularProc(
enc:NoneType=None, dec:NoneType=None, split_idx:NoneType=None, order:NoneType=None
):
Base class to write a non-lazy tabular processor for dataframes
These transforms are applied as soon as the data is available rather than as data is called from the DataLoader
Categorify
def Categorify(
enc:NoneType=None, dec:NoneType=None, split_idx:NoneType=None, order:NoneType=None
):
Transform the categorical variables to something similar to pd.Categorical
While visually in the DataFrame you will not see a change, the classes are stored in to.procs.categorify as we can see below on a dummy DataFrame:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()| a | |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 2 |
| 3 | 0 |
| 4 | 2 |
Each column’s unique values are stored in a dictionary of column:[values]:
cat = to.procs.categorify
cat.classes
{'a': ['#na#', np.int8(0), np.int8(1), np.int8(2)]}
FillStrategy
def FillStrategy(
args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):
Namespace containing the various filling strategies.
Currently, filling with the median, a constant, and the mode are supported.
FillMissing
def FillMissing(
fill_strategy:function=median, add_col:bool=True, fill_vals:NoneType=None
):
Fill the missing values in continuous columns.
ReadTabBatch
def ReadTabBatch(
to
):
Transform TabularPandas values into a Tensor with the ability to decode
TabDataLoader
def TabDataLoader(
dataset, # Map- or iterable-style dataset from which to load the data
bs:int=16, # Size of batch
shuffle:bool=False, # Whether to shuffle data
after_batch:NoneType=None, num_workers:int=0, verbose:bool=False, # Whether to print verbose logs
do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None,
retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None,
do_batch:NoneType=None
):
A transformed DataLoader for Tabular data
TabWeightedDL
def TabWeightedDL(
dataset, # Map- or iterable-style dataset from which to load the data
bs:int=16, # Size of batch
wgts:NoneType=None, shuffle:bool=False, # Whether to shuffle data
after_batch:NoneType=None, num_workers:int=0, verbose:bool=False, # Whether to print verbose logs
do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None,
retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None,
do_batch:NoneType=None
):
A transformed DataLoader for Tabular Weighted data
Integration example
For a more in-depth explanation, see the tabular tutorial
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()/Users/jhoward/aai-ws/fastai/fastai/torch_core.py:154: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:212.)
else as_tensor(x.values, **kwargs) if isinstance(x, (pd.Series, pd.DataFrame))
| workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | State-gov | Bachelors | Never-married | Prof-specialty | Not-in-family | White | False | 23.000000 | 287987.999279 | 13.0 | >=50k |
| 1 | State-gov | Some-college | Never-married | Tech-support | Unmarried | Black | False | 32.000000 | 131587.998019 | 10.0 | <50k |
| 2 | Private | HS-grad | Married-civ-spouse | Other-service | Husband | White | False | 58.000000 | 183810.000021 | 9.0 | <50k |
| 3 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 83.999998 | 172906.999298 | 10.0 | >=50k |
| 4 | Local-gov | Bachelors | Never-married | Sales | Own-child | White | False | 31.000000 | 128016.002079 | 13.0 | <50k |
| 5 | Private | Some-college | Never-married | Adm-clerical | Not-in-family | White | False | 22.000001 | 58915.997617 | 10.0 | <50k |
| 6 | ? | 11th | Never-married | ? | Not-in-family | White | False | 26.000000 | 176966.999519 | 7.0 | <50k |
| 7 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Other-relative | Asian-Pac-Islander | False | 23.999999 | 86744.998003 | 13.0 | <50k |
| 8 | Private | Some-college | Never-married | Other-service | Not-in-family | White | False | 38.000000 | 123833.001583 | 10.0 | <50k |
| 9 | Private | 11th | Married-civ-spouse | Farming-fishing | Husband | White | False | 35.000000 | 168322.000103 | 7.0 | <50k |
to.show()| workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3564 | Self-emp-not-inc | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 50.0 | 124793.0 | 9.0 | <50k |
| 5802 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | False | 30.0 | 205204.0 | 9.0 | <50k |
| 1087 | Self-emp-not-inc | HS-grad | Married-civ-spouse | Farming-fishing | Husband | White | False | 55.0 | 149168.0 | 9.0 | <50k |
| 1239 | Private | HS-grad | Never-married | Other-service | Own-child | White | False | 25.0 | 104193.0 | 9.0 | <50k |
| 2234 | Private | HS-grad | Never-married | Handlers-cleaners | Other-relative | White | False | 22.0 | 361138.0 | 9.0 | <50k |
| 8032 | Private | Bachelors | Married-civ-spouse | Sales | Husband | White | False | 54.0 | 391016.0 | 13.0 | >=50k |
| 4364 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 45.0 | 380922.0 | 9.0 | >=50k |
| 5404 | Self-emp-inc | HS-grad | Married-civ-spouse | Farming-fishing | Husband | White | False | 60.0 | 160062.0 | 9.0 | <50k |
| 693 | State-gov | Masters | Never-married | Prof-specialty | Not-in-family | Asian-Pac-Islander | True | 27.0 | 315640.0 | 10.0 | <50k |
| 2684 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Farming-fishing | Husband | White | False | 84.0 | 155057.0 | 13.0 | <50k |
We can decode any set of transformed data by calling to.decode_row with our raw data:
row = to.items.iloc[0]
to.decode_row(row)age 50.0
workclass Self-emp-not-inc
fnlwgt 124793.0
education HS-grad
education-num 9.0
marital-status Married-civ-spouse
occupation Craft-repair
relationship Husband
race White
sex Male
capital-gain 0
capital-loss 0
hours-per-week 30
native-country United-States
salary <50k
education-num_na False
Name: 3564, dtype: object
We can make new test datasets based on the training data with the to.new()
Since machine learning models can’t magically understand categories it was never trained on, the data should reflect this. If there are different missing values in your test data you should address this before training
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | education-num_na | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10000 | 0.470185 | 5 | 1.346422 | 10 | 1.165893 | 3 | 2 | 1 | 2 | Male | 0 | 0 | 40 | Philippines | 1 |
| 10001 | -0.923825 | 5 | 1.258980 | 12 | -0.424752 | 3 | 15 | 1 | 4 | Male | 0 | 0 | 40 | United-States | 1 |
| 10002 | 1.057137 | 5 | 0.150986 | 2 | -1.220074 | 1 | 9 | 2 | 5 | Female | 0 | 0 | 37 | United-States | 1 |
| 10003 | 0.543554 | 5 | -0.284206 | 12 | -0.424752 | 7 | 2 | 5 | 5 | Female | 0 | 0 | 43 | United-States | 1 |
| 10004 | 0.763661 | 6 | 1.449451 | 9 | 0.370571 | 3 | 5 | 1 | 5 | Male | 0 | 0 | 60 | United-States | 1 |
We can then convert it to a DataLoader:
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()| workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Bachelors | Married-civ-spouse | Adm-clerical | Husband | Asian-Pac-Islander | False | 45.0 | 338104.995686 | 13.0 |
| 1 | Private | HS-grad | Married-civ-spouse | Transport-moving | Husband | Other | False | 26.0 | 328662.994908 | 9.0 |
| 2 | Private | 11th | Divorced | Other-service | Not-in-family | White | False | 53.0 | 209021.999974 | 7.0 |
| 3 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | False | 46.0 | 162029.999608 | 9.0 |
| 4 | Self-emp-inc | Assoc-voc | Married-civ-spouse | Exec-managerial | Husband | White | False | 49.0 | 349229.997552 | 11.0 |
| 5 | Local-gov | Some-college | Married-civ-spouse | Exec-managerial | Husband | White | False | 34.0 | 124826.999041 | 10.0 |
| 6 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 53.0 | 290640.000020 | 10.0 |
| 7 | Private | Some-college | Never-married | Sales | Own-child | White | False | 19.0 | 106273.001990 | 10.0 |
| 8 | Private | Some-college | Married-civ-spouse | Protective-serv | Husband | Black | False | 72.0 | 53684.003204 | 10.0 |
| 9 | Private | Some-college | Never-married | Sales | Own-child | White | False | 20.0 | 505979.987402 | 10.0 |
# Create a TabWeightedDL
train_ds = to.train
weights = np.random.random(len(train_ds))
train_dl = TabWeightedDL(train_ds, wgts=weights, bs=64, shuffle=True)
train_dl.show_batch()| workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | HS-grad | Widowed | Craft-repair | Unmarried | White | False | 58.000000 | 178644.000556 | 9.0 | <50k |
| 1 | Private | HS-grad | Married-civ-spouse | Handlers-cleaners | Husband | White | False | 21.000000 | 131811.002389 | 9.0 | <50k |
| 2 | Private | 10th | Never-married | #na# | Own-child | White | True | 16.999999 | 294485.002377 | 10.0 | <50k |
| 3 | Private | HS-grad | Widowed | Adm-clerical | Not-in-family | White | False | 66.000001 | 98836.997894 | 9.0 | <50k |
| 4 | Self-emp-not-inc | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 28.000000 | 420054.007486 | 9.0 | <50k |
| 5 | Federal-gov | Assoc-voc | Divorced | Craft-repair | Not-in-family | White | False | 43.000000 | 92774.997808 | 11.0 | <50k |
| 6 | ? | Some-college | Never-married | ? | Own-child | White | False | 19.000000 | 234518.999208 | 10.0 | <50k |
| 7 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 35.000000 | 186009.000006 | 9.0 | >=50k |
| 8 | Federal-gov | HS-grad | Never-married | Adm-clerical | Own-child | Black | False | 25.000000 | 144258.998968 | 9.0 | <50k |
| 9 | ? | Some-college | Never-married | ? | Own-child | White | False | 20.000000 | 117788.996828 | 10.0 | <50k |
TabDataLoader’s create_item method
df = pd.DataFrame([{'age': 35}])
to = TabularPandas(df)
dls = to.dataloaders()
print(dls.create_item(0))
# test_eq(dls.create_item(0).items.to_dict(), {'age': 0.5330614747286777, 'workclass': 5, 'fnlwgt': -0.26305443080666174, 'education': 10, 'education-num': 1.169790230219763, 'marital-status': 1, 'occupation': 13, 'relationship': 5, 'race': 3, 'sex': ' Female', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 35, 'native-country': 'United-States', 'salary': 1, 'education-num_na': 1})age 35
Name: 0, dtype: int8
Other target types
Multi-label categories
one-hot encoded label
def _mock_multi_label(df):
    "Add three boolean one-hot target columns ('salary', 'male', 'white') derived from existing columns of `df`, in place, and return `df`."
    sal, sex, white = [], [], []
    for row in df.itertuples():
        sal.append(row.salary == '>=50k')
        sex.append(row.sex == ' Male')
        white.append(row.race == ' White')
    # Overwrite 'salary' and add 'male'/'white' as numpy boolean arrays.
    # NOTE: the leading space in ' Male' / ' White' matches the raw ADULT_SAMPLE csv values.
    df['salary'] = np.array(sal)
    df['male'] = np.array(sex)
    df['white'] = np.array(white)
    return df

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)df_main.head()| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | male | white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | True | False | True |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | True | True | True |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | False | False | False |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | True | True | False |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | False | False | False |
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
CPU times: user 30.8 ms, sys: 774 us, total: 31.6 ms
Wall time: 31.4 ms
dls = to.dataloaders()
dls.valid.show_batch()| workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | male | white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Wife | Black | False | 35.000000 | 110668.001907 | 9.0 | False | False | False |
| 1 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 51.000000 | 33303.995791 | 9.0 | False | True | True |
| 2 | Private | Some-college | Never-married | Adm-clerical | Not-in-family | White | False | 37.000000 | 38468.006454 | 10.0 | False | False | True |
| 3 | ? | Some-college | Married-civ-spouse | ? | Husband | White | False | 63.999999 | 108082.000741 | 10.0 | False | True | True |
| 4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Husband | White | False | 38.000000 | 187747.999948 | 13.0 | True | True | True |
| 5 | Private | HS-grad | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | False | 28.000000 | 375313.002386 | 9.0 | False | True | False |
| 6 | Private | HS-grad | Divorced | #na# | Unmarried | White | True | 36.000000 | 130199.998135 | 10.0 | False | True | True |
| 7 | Private | 11th | Married-civ-spouse | Craft-repair | Husband | White | False | 36.000000 | 123151.001427 | 7.0 | False | True | True |
| 8 | Private | Bachelors | Never-married | Adm-clerical | Not-in-family | Asian-Pac-Islander | False | 32.000000 | 107218.002435 | 13.0 | False | True | False |
| 9 | Private | 11th | Divorced | Other-service | Unmarried | White | False | 32.000000 | 185732.000241 | 7.0 | False | False | True |
Not one-hot encoded
def _mock_multi_label(df):
    "Add a space-delimited multi-label 'target' column (any of '>50k', 'male', 'white') built from the salary/sex/race columns of `df`, in place, and return `df`."
    targ = []
    for row in df.itertuples():
        labels = []
        # NOTE: the leading space in ' Male' / ' White' matches the raw ADULT_SAMPLE csv values.
        if row.salary == '>=50k': labels.append('>50k')
        if row.sex == ' Male': labels.append('male')
        if row.race == ' White': labels.append('white')
        targ.append(' '.join(labels))
    df['target'] = np.array(targ)
    return df

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)df_main.head()| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k | >50k white |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k | >50k male white |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k | |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k | >50k male |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
@MultiCategorize
def encodes(self, to:Tabular):
    # Encoding of the y columns is currently a no-op; the commented call
    # would map labels to vocab indices — kept for reference.
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to

@MultiCategorize
def decodes(self, to:Tabular):
    # Decoding is likewise a no-op; the commented call would map indices
    # back to labels — kept for reference.
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
CPU times: user 10.5 ms, sys: 201 us, total: 10.7 ms
Wall time: 10.6 ms
to.procs[2].vocab
['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Regression
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
CPU times: user 21.8 ms, sys: 969 us, total: 22.7 ms
Wall time: 21.9 ms
to.procs[-1].means
{'fnlwgt': np.float64(192511.077125),
'education-num': np.float64(10.076749801635742)}
dls = to.dataloaders()
dls.valid.show_batch()| workclass | education | marital-status | occupation | relationship | race | education-num_na | fnlwgt | education-num | age | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Local-gov | Some-college | Married-spouse-absent | Exec-managerial | Unmarried | Black | False | 216129.000726 | 10.0 | 38.0 |
| 1 | Private | HS-grad | Divorced | Protective-serv | Not-in-family | Black | False | 162814.000297 | 9.0 | 34.0 |
| 2 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 193881.999975 | 13.0 | 44.0 |
| 3 | Local-gov | HS-grad | Married-civ-spouse | Exec-managerial | Husband | White | False | 144778.000789 | 9.0 | 43.0 |
| 4 | Private | HS-grad | Never-married | Other-service | Own-child | White | False | 304385.993937 | 9.0 | 24.0 |
| 5 | Private | 5th-6th | Never-married | Craft-repair | Not-in-family | White | False | 155621.001484 | 3.0 | 28.0 |
| 6 | Self-emp-not-inc | 7th-8th | Divorced | Other-service | Unmarried | White | False | 385632.000374 | 4.0 | 56.0 |
| 7 | Private | Bachelors | Never-married | Prof-specialty | Own-child | White | False | 244365.998597 | 13.0 | 22.0 |
| 8 | Private | 11th | Divorced | Handlers-cleaners | Own-child | White | False | 112262.998271 | 7.0 | 30.0 |
| 9 | Local-gov | Some-college | Married-civ-spouse | Protective-serv | Husband | White | False | 195258.000082 | 10.0 | 41.0 |
Not being used now - for multi-modal
class TensorTabular(fastuple):
    "Tuple of tensors for tabular rows; builds per-row show contexts and displays them."
    def get_ctxs(self, max_n=10, **kwargs):
        # One context (an empty DataFrame row) per sample, capped at `max_n`.
        count = min(self[0].shape[0], max_n)
        empty = pd.DataFrame(index=range(count))
        ctxs = []
        for idx in range(count):
            ctxs.append(empty.iloc[idx])
        return ctxs
    def display(self, ctxs):
        frame = pd.DataFrame(ctxs)
        display_df(frame)
class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        # With no context, the line shows as itself; otherwise append to the context.
        if ctx is None:
            return self
        return ctx.append(self)
class ReadTabLine(ItemTransform):
    "Encode a processed dataframe row into a `TensorTabular` of (categorical, continuous) tensors, and decode one back to a `TabularLine`."
    def __init__(self, proc): self.proc = proc  # fitted tabular processor holding cat_names/cont_names/y_names
    def encodes(self, row):
        # Pull the row's values for the categorical and continuous columns.
        # NOTE(review): `cat_names`/`cont_names` support `.map` — presumably fastai `L` lists; confirm.
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        # Categorical codes become a long tensor; continuous values a float tensor.
        return TensorTabular(tensor(cats).long(),tensor(conts).float())
    def decodes(self, o):
        # Wrap the encoded values back into a TabularPandas so the processor can invert its transforms.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # Reassemble the decoded values into a Series keyed by column name.
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))
class ReadTabTarget(ItemTransform):
    "Encode the target column of a processed row as an int64 code, and decode a code back to its `Category` label."
    def __init__(self, proc): self.proc = proc  # fitted processor holding `y_names` and the `classes` vocab mapping
    def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
    def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")