df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')  # converts the column to datetime64 dtype in place
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))
For example, if we have a series of dates, we can then generate features such as `Year`, `Month`, `Day`, `Dayofweek`, `Is_month_start`, etc., as shown below:
df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')  # replaces 'date' with its date-part feature columns
df.head()
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')  # elapsed days around each 'event', grouped by 'base'
df.head()
This function works by determining if a column is continuous or categorical based on the cardinality of its values. If the cardinality is above the `max_card` parameter (or the column has a `float` datatype) then it will be added to `cont_names`, otherwise to `cat_names`. An example is below:
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})
cont_names, cat_names = cont_cat_split(df)
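With the default `max_card=20`, we would expect the float columns to land in `cont_names` and everything else in `cat_names`; a quick way to inspect the split:
print(f'cont_names: {cont_names}')  # expected: ['cont1', 'f16']
print(f'cat_names:  {cat_names}')   # expected: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']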
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
})
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'] = df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True)  # inplace=True is no longer supported by pandas
cont_names, cat_names = cont_cat_split(df, max_card=0)
For example, we will make a sample `DataFrame` with `int`, `float`, `bool`, and `object` datatypes:
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypes
We can then call `df_shrink_dtypes` to find the smallest possible datatype that can support the data:
dt = df_shrink_dtypes(df)
dt
`df_shrink(df)` attempts to make a `DataFrame` use less memory by fitting its numeric columns into the smallest suitable datatypes. In addition:

- `boolean`, `category`, and `datetime64[ns]` dtype columns are ignored.
- `object` columns are categorified, which can save a lot of memory on large datasets. This can be turned off with `obj2cat=False`.
- `int2uint=True` fits `int` types into `uint` types when all the data in a column is >= 0.
- Columns can be excluded by name using `skip=['col1','col2']`.
To get only the new column datatypes without actually casting a `DataFrame`, use `df_shrink_dtypes()` with all the same parameters as `df_shrink()`.
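For instance, a minimal sketch on the frame above (the `skip` and `int2uint` arguments mirror `df_shrink`):
dt = df_shrink_dtypes(df, skip=['date'], int2uint=True)
dt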
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])  # shrink everything except the 'date' column
Let's compare the two:
df.dtypes
df2.dtypes
We can see that the datatypes changed, and even further we can look at their relative memory usages:
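For instance, a quick check with pandas' `memory_usage`:
print(f'original: {df.memory_usage(deep=True).sum()} bytes')  # deep=True counts object-column contents
print(f'shrunk:   {df2.memory_usage(deep=True).sum()} bytes')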
Here's another example using the `ADULT_SAMPLE` dataset:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
We reduced the overall memory used by 79%!
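That figure can be reproduced along these lines (the exact percentage may vary by pandas version):
old, new = df.memory_usage(deep=True).sum(), new_df.memory_usage(deep=True).sum()
print(f'memory usage reduced by {100 * (old - new) / old:.0f}%')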
`TabularPandas` expects the following parameters:

- `df`: A `DataFrame` of your data
- `cat_names`: Your categorical `x` variables
- `cont_names`: Your continuous `x` variables
- `y_names`: Your dependent `y` variables
  - Note: Mixed y's such as regression and classification are not currently supported; however, multiple regression or classification outputs are
- `y_block`: How to sub-categorize the type of `y_names` (`CategoryBlock` or `RegressionBlock`)
- `splits`: How to split your data
- `do_setup`: Whether `Tabular` will run the data through the `procs` upon initialization
- `device`: `cuda` or `cpu`
- `inplace`: If `True`, `Tabular` will not keep a separate copy of your original `DataFrame` in memory. You should ensure `pd.options.mode.chained_assignment` is `None` before setting this
- `reduce_memory`: `fastai` will attempt to reduce the overall memory usage of the input `DataFrame` with `df_shrink`
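A minimal sketch of how these arguments fit together (the toy columns `a`, `b`, and `y` are made up for illustration):
df = pd.DataFrame({'a': [0, 1, 2, 0, 2], 'b': [0.5, 1.5, 0.5, 2.5, 1.5], 'y': [0, 1, 0, 1, 1]})
splits = RandomSplitter(valid_pct=0.2)(range_of(df))
to = TabularPandas(df, procs=[Categorify, Normalize], cat_names=['a'], cont_names=['b'],
                   y_names='y', y_block=CategoryBlock(), splits=splits)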
These transforms are applied as soon as the data is available, rather than as the data is called from the `DataLoader`.
While visually you will not see a change in the `DataFrame`, the classes are stored in `to.procs.categorify`, as we can see below on a dummy `DataFrame`:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()
Each column's unique values are stored in a dictionary of `column:[values]`:
cat = to.procs.categorify
cat.classes
Currently, filling with the `median`, a `constant`, and the `mode` are supported.
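For example, a minimal sketch of picking a non-default strategy through the `FillStrategy` namespace (dummy data, in the same style as above):
df = pd.DataFrame({'a': [0, 1, np.nan, 1, 2]})
fm = FillMissing(fill_strategy=FillStrategy.mode)
to = TabularPandas(df, fm, cont_names=['a'])
to.items.head()  # 'a' is filled with its mode and an 'a_na' indicator column is added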
Integration example
For a more in-depth explanation, see the tabular tutorial.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
to.show()
We can decode any set of transformed data by calling `to.decode_row` with our raw data:
row = to.items.iloc[0]
to.decode_row(row)
We can make new test datasets based on the training data with the `to.new()` method:
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()
We can then convert it to a `DataLoader`:
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()
Next, we mock up a one-hot-encoded multi-label target by deriving three boolean columns from the salary, sex, and race fields:
def _mock_multi_label(df):
    "Derive three boolean targets from the salary, sex, and race columns"
    sal,sex,white = [],[],[]
    for row in df.itertuples():
        sal.append(row.salary == '>=50k')
        sex.append(row.sex == ' Male')
        white.append(row.race == ' White')
    df['salary'] = np.array(sal)
    df['male'] = np.array(sex)
    df['white'] = np.array(white)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
Alternatively, the labels can be packed into a single space-delimited target column:
def _mock_multi_label(df):
    "Mock up a space-delimited multi-label target from the salary, sex, and race columns"
    targ = []
    for row in df.itertuples():
        labels = []
        if row.salary == '>=50k': labels.append('>50k')
        if row.sex == ' Male':    labels.append('male')
        if row.race == ' White':  labels.append('white')
        targ.append(' '.join(labels))
    df['target'] = np.array(targ)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
@MultiCategorize
def encodes(self, to:Tabular):
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to

@MultiCategorize
def decodes(self, to:Tabular):
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)
to.procs[2].vocab
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)
to.procs[-1].means
dls = to.dataloaders()
dls.valid.show_batch()
class TensorTabular(fastuple):
    def get_ctxs(self, max_n=10, **kwargs):
        n_samples = min(self[0].shape[0], max_n)
        df = pd.DataFrame(index = range(n_samples))
        return [df.iloc[i] for i in range(n_samples)]

    def display(self, ctxs): display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs): return self if ctx is None else ctx.append(self)

class ReadTabLine(ItemTransform):
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

class ReadTabTarget(ItemTransform):
    def __init__(self, proc): self.proc = proc
    def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
    def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")