Data core

Core functionality for gathering data

The classes here provide functionality for applying a list of transforms to a set of items (TfmdLists, Datasets) or a DataLoader (TfmdDl) as well as the base class used to gather the data for model training: DataLoaders.

show_batch is a type-dispatched function that is responsible for showing decoded samples. x and y are the input and the target in the batch to be shown, and are passed along to dispatch on their types. There is a different implementation of show_batch if x is a TensorImage or a TensorText for instance (see vision.core or text.data for more details). ctxs can be passed, but the function is responsible for creating them if necessary. kwargs depend on the specific implementation.
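For example, here is a minimal sketch (not part of the fastai source; the type is hypothetical) of how a new implementation can be registered with fastcore's typedispatch:

from fastcore.dispatch import typedispatch
from fastai.torch_core import TensorBase

# `MyTensor` is a hypothetical type, used only to illustrate the dispatch
class MyTensor(TensorBase): pass

@typedispatch
def show_batch(x:MyTensor, y, samples, ctxs=None, max_n=9, **kwargs):
    # Print each decoded sample instead of plotting it
    for i,s in enumerate(samples[:max_n]): print(f'sample {i}: {s}')
    return ctxs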


source

get_show_batch_func


def get_show_batch_func(
    x_typ:_AnyMeta=Any, y_typ:_AnyMeta=Any, samples_typ:_AnyMeta=Any
):

Helper function to manually get show_batch function for given input types.
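For instance, one might grab the implementation that dispatch would pick for a given input type (a usage sketch; the type is illustrative):

# Get the `show_batch` implementation used for `TensorImage` inputs
f = get_show_batch_func(x_typ=TensorImage)
# `f` has the same signature as `show_batch` and can be called directly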

show_results is a type-dispatched function that is responsible for showing decoded samples and their corresponding outs. Like in show_batch, x and y are the input and the target in the batch to be shown, and are passed along to dispatch on their types. ctxs can be passed, but the function is responsible for creating them if necessary. kwargs depend on the specific implementation.
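The registration pattern is the same as for show_batch; a minimal sketch for the hypothetical MyTensor above, now also receiving the decoded predictions outs:

@typedispatch
def show_results(x:MyTensor, y, samples, outs, ctxs=None, max_n=9, **kwargs):
    # Print each decoded sample next to its corresponding prediction
    for s,o in zip(samples[:max_n], outs[:max_n]): print(f'{s} -> {o}')
    return ctxs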


source

TfmdDL


def TfmdDL(
    dataset, # Map- or iterable-style dataset from which to load the data
    bs:int=64, # Size of batch
    shuffle:bool=False, # Whether to shuffle data
    num_workers:int=None, # Number of CPU cores to use in parallel (default: All available up to 16)
    verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
    n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
    wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
    after_batch:NoneType=None, after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None,
    create_batch:NoneType=None, retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None,
    shuffle_fn:NoneType=None, do_batch:NoneType=None
):

Transformed DataLoader

A TfmdDL is a DataLoader that creates a Pipeline from a list of Transforms for the callbacks after_item, before_batch and after_batch. As a result, it can decode or show a processed batch.

class _Category(int, ShowTitle): pass
#Test retain type
class NegTfm(Transform):
    def encodes(self, x): return torch.neg(x)
    def decodes(self, x): return torch.neg(x)
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=NegTfm(), bs=4, num_workers=4)
b = tdl.one_batch()
test_eq(type(b[0]), TensorImage)
b = (tensor([1.,1.,1.,1.]),)
test_eq(type(tdl.decode_batch(b)[0][0]), TensorImage)
class A(Transform): 
    def encodes(self, x): return x 
    def decodes(self, x): return TitledInt(x) 

@Transform
def f(x)->None: return fastuple((x,x))

start = torch.arange(50)
test_eq_type(f(2), fastuple((2,2)))
a = A()
tdl = TfmdDL(start, after_item=lambda x: (a(x), f(x)), bs=4)
x,y = tdl.one_batch()
test_eq(type(y), fastuple)

s = tdl.decode_batch((x,y))
test_eq(type(s[0][1]), fastuple)
tdl = TfmdDL(torch.arange(0,50), after_item=A(), after_batch=NegTfm(), bs=4)
test_eq(tdl.dataset[0], start[0])
test_eq(len(tdl), (50-1)//4+1)
test_eq(tdl.bs, 4)
test_stdout(tdl.show_batch, '0\n1\n2\n3')
test_stdout(partial(tdl.show_batch, unique=True), '0\n0\n0\n0')
class B(Transform):
    parameters = 'a'
    def __init__(self): self.a = torch.tensor(0.)
    def encodes(self, x): return x
    
tdl = TfmdDL([(TensorImage([1]),)] * 4, after_batch=B(), bs=4)
test_eq(tdl.after_batch.fs[0].a.device, torch.device('cpu'))
tdl.to(default_device())
assert str(tdl.after_batch.fs[0].a.device).startswith(str(default_device()))

Methods


source

DataLoader.one_batch


def one_batch():

Return one batch from DataLoader.

tfm = NegTfm()
tdl = TfmdDL(start, after_batch=tfm, bs=4)
b = tdl.one_batch()
test_eq(tensor([0,-1,-2,-3]), b)

source

TfmdDL.decode


def decode(
    b, # Batch to decode
):

Decode b using tfms

test_eq(tdl.decode(b), tensor(0,1,2,3))

source

TfmdDL.decode_batch


def decode_batch(
    b, # Batch to decode
    max_n:int=9, # Maximum number of items to decode
    full:bool=True, # Whether to decode all transforms. If `False`, decode up to the point the item knows how to show itself
):

Decode b entirely

test_eq(tdl.decode_batch(b), [0,1,2,3])

source

TfmdDL.show_batch


def show_batch(
    b:NoneType=None, # Batch to show
    max_n:int=9, # Maximum number of items to show
    ctxs:NoneType=None, # List of `ctx` objects to show data. Could be matplotlib axis, DataFrame etc
    show:bool=True, # Whether to display data
    unique:bool=False, # Whether to show only one
    kwargs:VAR_KEYWORD
):

Show b (defaults to one_batch), a list of lists of pipeline outputs (i.e. output of a DataLoader)
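A usage sketch with any of the tdl objects above (what gets displayed depends on the loader's transforms):

tdl.show_batch()             # grabs `one_batch()` and shows it
tdl.show_batch(unique=True)  # repeats a single item, handy for eyeballing augmentations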


source

DataLoader.to


def to(
    device
):

Put self and its transforms state on device


source

DataLoaders


def DataLoaders(
    loaders:VAR_POSITIONAL, # [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader) objects to wrap
    path:str | Path='.', # Path to store export objects
    device:NoneType=None, # Device to put [`DataLoaders`](https://docs.fast.ai/data.core.html#dataloaders)
):

Basic wrapper around several DataLoaders.

dls = DataLoaders(tdl,tdl)
x = dls.train.one_batch()
x2 = first(tdl)
test_eq(x,x2)
x2 = dls.one_batch()
test_eq(x,x2)

Multiple transforms can be added to multiple dataloaders using DataLoaders.add_tfms. You can specify the dataloaders by a list of names dls.add_tfms(...,'valid',...) or by index dls.add_tfms(...,1,...); by default, transforms are added to all dataloaders. event is a required argument that determines when the transform will be run; for more information on events, please refer to TfmdDL. tfms is a list of Transforms, and is a required argument.

class _TestTfm(Transform):
    def encodes(self, o):  return torch.ones_like(o)
    def decodes(self, o):  return o
tdl1,tdl2 = TfmdDL(start, bs=4),TfmdDL(start, bs=4)
dls2 = DataLoaders(tdl1,tdl2)
dls2.add_tfms([_TestTfm()],'after_batch',['valid'])
dls2.add_tfms([_TestTfm()],'after_batch',[1])
dls2.train.after_batch,dls2.valid.after_batch,
(Pipeline: , Pipeline: _TestTfm -> _TestTfm)
class _T(Transform):  
    def encodes(self, o):  return -o
class _T2(Transform): 
    def encodes(self, o):  return o/2

#test tfms are applied on both train and valid dls
dls_from_ds = DataLoaders.from_dsets([1,], [5,], bs=1, after_item=_T, after_batch=_T2)
b = first(dls_from_ds.train)
test_eq(b, tensor([-.5]))
b = first(dls_from_ds.valid)
test_eq(b, tensor([-2.5]))

Methods


source

DataLoaders.__getitem__


def __getitem__(
    i
):

Retrieve DataLoader at i (0 is training, 1 is validation)

x2
tensor([ 0, -1, -2, -3])
x2 = dls[0].one_batch()
test_eq(x,x2)

DataLoaders.train


def <lambda>[partial: 0](
    x
):

partial(func, *args, **keywords) - new function with partial application of the given arguments and keywords.


DataLoaders.valid


def <lambda>[partial: 1](
    x
):

partial(func, *args, **keywords) - new function with partial application of the given arguments and keywords.


DataLoaders.train_ds


def <lambda>[partial: 0](
    x
):

partial(func, *args, **keywords) - new function with partial application of the given arguments and keywords.


DataLoaders.valid_ds


def <lambda>[partial: 1](
    x
):

partial(func, *args, **keywords) - new function with partial application of the given arguments and keywords.
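These four properties are simple shortcuts into the wrapped loaders and their datasets; a quick check with the dls built above:

assert dls.train is dls[0] and dls.valid is dls[1]
assert dls.train_ds is dls.train.dataset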


source

FilteredBase


def FilteredBase(
    args:VAR_POSITIONAL, dl_type:NoneType=None, kwargs:VAR_KEYWORD
):

Base class for lists with subsets


source

FilteredBase.dataloaders


def dataloaders(
    bs:int=64, # Batch size
    shuffle_train:bool=None, # (Deprecated, use `shuffle`) Shuffle training [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    shuffle:bool=True, # Shuffle training [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    val_shuffle:bool=False, # Shuffle validation [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    n:int=None, # Size of [`Datasets`](https://docs.fast.ai/data.core.html#datasets) used to create [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    path:str | Path='.', # Path to put in [`DataLoaders`](https://docs.fast.ai/data.core.html#dataloaders)
    dl_type:TfmdDL=None, # Type of [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    dl_kwargs:list=None, # List of kwargs to pass to individual [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)s
    device:torch.device=None, # Device to put [`DataLoaders`](https://docs.fast.ai/data.core.html#dataloaders)
    drop_last:bool=None, # Drop last incomplete batch, defaults to `shuffle`
    val_bs:int=None, # Validation batch size, defaults to `bs`
    num_workers:int=None, # Number of CPU cores to use in parallel (default: All available up to 16)
    verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, indexed:NoneType=None,
    persistent_workers:bool=False, pin_memory_device:str='', wif:NoneType=None, before_iter:NoneType=None,
    after_item:NoneType=None, before_batch:NoneType=None, after_batch:NoneType=None, after_iter:NoneType=None,
    create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None, retain:NoneType=None,
    get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None, do_batch:NoneType=None
)->DataLoaders:

Get a DataLoaders

source

TfmdLists


def TfmdLists(
    items:list, # Items to apply `Transform`s to
    tfms:MutableSequence | Pipeline, # `Transform`(s) or `Pipeline` to apply
    use_list:bool=None, # Use `list` in `L`
    do_setup:bool=True, # Call `setup()` for `Transform`
    split_idx:int=None, # Apply `Transform`(s) to training or validation set. `0` for training set and `1` for validation set
    train_setup:bool=True, # Apply `Transform`(s) only on training [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    splits:list=None, # Indices for training and validation sets
    types:NoneType=None, # Types of data in `items`
    verbose:bool=False, # Print verbose output
    dl_type:TfmdDL=None, # Type of [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
):

A Pipeline of tfms applied to a collection of items


source

decode_at


def decode_at(
    o, idx
):

Decoded item at idx

Exported source
def decode_at(o, idx):
    "Decoded item at `idx`"
    return o.decode(o[idx])

source

show_at


def show_at(
    o, idx, kwargs:VAR_KEYWORD
):

Show item at idx

Exported source
def show_at(o, idx, **kwargs):
    "Show item at `idx`"
    return o.show(o[idx], **kwargs)

A TfmdLists combines a collection of objects with a Pipeline. tfms can either be a Pipeline or a list of transforms, in which case it will wrap them in a Pipeline. use_list is passed along to L with the items, and split_idx is passed to each transform of the Pipeline. do_setup indicates if the Pipeline.setup method should be called during initialization.

class _IntFloatTfm(Transform):
    def encodes(self, o):  return TitledInt(o)
    def decodes(self, o):  return TitledFloat(o)
int2f_tfm=_IntFloatTfm()

def _neg(o): return -o
neg_tfm = Transform(_neg, _neg)
items = L([1.,2.,3.]); tfms = [neg_tfm, int2f_tfm]
tl = TfmdLists(items, tfms=tfms)
test_eq_type(tl[0], TitledInt(-1))
test_eq_type(tl[1], TitledInt(-2))
test_eq_type(tl.decode(tl[2]), TitledFloat(3.))
test_stdout(lambda: show_at(tl, 2), '-3')
test_eq(tl.types, [float, float, TitledInt])
tl
TfmdLists: [1.0, 2.0, 3.0]
tfms - [_neg(enc:1,dec:1), _IntFloatTfm(enc:1,dec:1)]
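The decode_at and show_at helpers documented above are thin wrappers around decode and show; a quick check on this tl:

test_eq_type(decode_at(tl, 1), TitledFloat(2.))
test_stdout(lambda: show_at(tl, 1), '-2')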
# add splits to TfmdLists
splits = [[0,2],[1]]
tl = TfmdLists(items, tfms=tfms, splits=splits)
test_eq(tl.n_subsets, 2)
test_eq(tl.train, tl.subset(0))
test_eq(tl.valid, tl.subset(1))
test_eq(tl.train.items, items[splits[0]])
test_eq(tl.valid.items, items[splits[1]])
test_eq(tl.train.tfms.split_idx, 0)
test_eq(tl.valid.tfms.split_idx, 1)
test_eq(tl.train.new_empty().split_idx, 0)
test_eq(tl.valid.new_empty().split_idx, 1)
test_eq_type(tl.splits, L(splits))
assert not tl.overlapping_splits()
df = pd.DataFrame(dict(a=[1,2,3],b=[2,3,4]))
tl = TfmdLists(df, lambda o: o.a+1, splits=[[0],[1,2]])
test_eq(tl[1,2], [3,4])
tr = tl.subset(0)
test_eq(tr[:], [2])
val = tl.subset(1)
test_eq(val[:], [3,4])
class _B(Transform):
    def __init__(self): self.m = 0
    def encodes(self, o): return o+self.m
    def decodes(self, o): return o-self.m
    def setups(self, items): 
        print(items)
        self.m = tensor(items).float().mean().item()

# test for setup, which updates `self.m`
tl = TfmdLists(items, _B())
test_eq(tl.m, 2)
TfmdLists: [1.0, 2.0, 3.0]
tfms - []

Here’s how we can use TfmdLists.setup to implement a simple category list, getting labels from a mock file list:

class _Cat(Transform):
    order = 1
    def encodes(self, o):    return int(self.o2i[o])
    def decodes(self, o):    return TitledStr(self.vocab[o])
    def setups(self, items): self.vocab,self.o2i = uniqueify(L(items), sort=True, bidir=True)
tcat = _Cat()

def _lbl(o): return TitledStr(o.split('_')[0])

# Check that tfms are sorted by `order` & `_lbl` is called first
fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg']
tl = TfmdLists(fns, [tcat,_lbl])
exp_voc = ['cat','dog']
test_eq(tcat.vocab, exp_voc)
test_eq(tl.tfms.vocab, exp_voc)
test_eq(tl.vocab, exp_voc)
test_eq(tl, (1,0,0,0,1))
test_eq([tl.decode(o) for o in tl], ('dog','cat','cat','cat','dog'))
#Check only the training set is taken into account for setup
tl = TfmdLists(fns, [tcat,_lbl], splits=[[0,4], [1,2,3]])
test_eq(tcat.vocab, ['dog'])
tfm = NegTfm(split_idx=1)
tds = TfmdLists(start, A())
tdl = TfmdDL(tds, after_batch=tfm, bs=4)
x = tdl.one_batch()
test_eq(x, torch.arange(4))
tds.split_idx = 1
x = tdl.one_batch()
test_eq(x, -torch.arange(4))
tds.split_idx = 0
x = tdl.one_batch()
test_eq(x, torch.arange(4))
tds = TfmdLists(start, A())
tdl = TfmdDL(tds, after_batch=NegTfm(), bs=4)
test_eq(tdl.dataset[0], start[0])
test_eq(len(tdl), (len(tds)-1)//4+1)
test_eq(tdl.bs, 4)
test_stdout(tdl.show_batch, '0\n1\n2\n3')

source

TfmdLists.subset


def subset(
    i
):

New TfmdLists with the same tfms that only includes items in the ith split


source

TfmdLists.infer_idx


def infer_idx(
    x
):

Finds the index where self.tfms can be applied to x, depending on the type of x


source

TfmdLists.infer


def infer(
    x
):

Apply self.tfms to x starting at the right tfm depending on the type of x

def mult(x): return x*2
mult.order = 2

fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg']
tl = TfmdLists(fns, [_lbl,_Cat(),mult])

test_eq(tl.infer_idx('dog_45.jpg'), 0)
test_eq(tl.infer('dog_45.jpg'), 2)

test_eq(tl.infer_idx(4), 2)
test_eq(tl.infer(4), 8)

test_fail(lambda: tl.infer_idx(2.0))
test_fail(lambda: tl.infer(2.0))

source

Datasets


def Datasets(
    items:list=None, # List of items to create [`Datasets`](https://docs.fast.ai/data.core.html#datasets)
    tfms:MutableSequence | Pipeline=None, # List of `Transform`(s) or `Pipeline` to apply
    tls:TfmdLists=None, # If None, `self.tls` is generated from `items` and `tfms`
    n_inp:int=None, # Number of elements in [`Datasets`](https://docs.fast.ai/data.core.html#datasets) tuple that should be considered part of input
    dl_type:NoneType=None, # Default type of [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader) used when function [`FilteredBase.dataloaders`](https://docs.fast.ai/data.core.html#filteredbase.dataloaders) is called
    kwargs:VAR_KEYWORD
):

A dataset that creates a tuple from each tfms

A Datasets creates a tuple from items (typically input,target) by applying to them each list of Transform (or Pipeline) in tfms. Note that if tfms contains only one list of tfms, the items given by Datasets will be tuples of one element.

n_inp is the number of elements in the tuples that should be considered part of the input; it defaults to 1 if tfms consists of one set of transforms, and len(tfms)-1 otherwise. In most cases the number of elements in the tuples spit out by Datasets will be 2 (for input,target), but it can happen that there are 3 (Siamese networks or tabular data), in which case we need to be able to determine when the inputs end and the targets begin.

items = [1,2,3,4]
dsets = Datasets(items, [[neg_tfm,int2f_tfm], [add(1)]])
t = dsets[0]
test_eq(t, (-1,2))
test_eq(dsets[0,1,2], [(-1,2),(-2,3),(-3,4)])
test_eq(dsets.n_inp, 1)
dsets.decode(t)
(1.0, 2)
class Norm(Transform):
    def encodes(self, o): return (o-self.m)/self.s
    def decodes(self, o): return (o*self.s)+self.m
    def setups(self, items):
        its = tensor(items).float()
        self.m,self.s = its.mean(),its.std()
items = [1,2,3,4]
nrm = Norm()
dsets = Datasets(items, [[neg_tfm,int2f_tfm], [neg_tfm,nrm]])

x,y = zip(*dsets)
test_close(tensor(y).mean(), 0)
test_close(tensor(y).std(), 1)
test_eq(x, (-1,-2,-3,-4,))
test_eq(nrm.m, -2.5)
test_stdout(lambda:show_at(dsets, 1), '-2')

test_eq(dsets.m, nrm.m)
test_eq(dsets.norm.m, nrm.m)
test_eq(dsets.train.norm.m, nrm.m)
test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','kid_1.jpg']
tcat = _Cat()
dsets = Datasets(test_fns, [[tcat,_lbl]], splits=[[0,1,2], [3,4]])
test_eq(tcat.vocab, ['cat','dog'])
test_eq(dsets.train, [(1,),(0,),(0,)])
test_eq(dsets.valid[0], (0,))
test_stdout(lambda: show_at(dsets.train, 0), "dog")
inp = [0,1,2,3,4]
dsets = Datasets(inp, tfms=[None])

test_eq(*dsets[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsets[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsets[mask], [(0,),(3,)])   # Retrieve two items by mask
inp = pd.DataFrame(dict(a=[5,1,2,3,4]))
dsets = Datasets(inp, tfms=attrgetter('a')).subset(0)
test_eq(*dsets[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsets[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsets[mask], [(5,),(3,)])   # Retrieve two items by mask
#test n_inp
inp = [0,1,2,3,4]
dsets = Datasets(inp, tfms=[None])
test_eq(dsets.n_inp, 1)
dsets = Datasets(inp, tfms=[[None],[None],[None]])
test_eq(dsets.n_inp, 2)
dsets = Datasets(inp, tfms=[[None],[None],[None]], n_inp=1)
test_eq(dsets.n_inp, 1)
# splits can be indices
dsets = Datasets(range(5), tfms=[None], splits=[tensor([0,2]), [1,3,4]])

test_eq(dsets.subset(0), [(0,),(2,)])
test_eq(dsets.train, [(0,),(2,)])       # Subset 0 is aliased to `train`
test_eq(dsets.subset(1), [(1,),(3,),(4,)])
test_eq(dsets.valid, [(1,),(3,),(4,)])     # Subset 1 is aliased to `valid`
test_eq(*dsets.valid[2], 4)
#assert '[(1,),(3,),(4,)]' in str(dsets) and '[(0,),(2,)]' in str(dsets)
dsets
(#5) [(0,),(1,),(2,),(3,),(4,)]
# splits can be boolean masks (they don't have to cover all items, but must be disjoint)
splits = [[False,True,True,False,True], [True,False,False,False,False]]
dsets = Datasets(range(5), tfms=[None], splits=splits)

test_eq(dsets.train, [(1,),(2,),(4,)])
test_eq(dsets.valid, [(0,)])
# apply transforms to all items
tfm = [[lambda x: x*2,lambda x: x+1]]
splits = [[1,2],[0,3,4]]
dsets = Datasets(range(5), tfm, splits=splits)
test_eq(dsets.train,[(3,),(5,)])
test_eq(dsets.valid,[(1,),(7,),(9,)])
test_eq(dsets.train[False,True], [(5,)])
# only transform subset 1
class _Tfm(Transform):
    split_idx=1
    def encodes(self, x): return x*2
    def decodes(self, x): return TitledStr(x//2)
dsets = Datasets(range(5), [_Tfm()], splits=[[1,2],[0,3,4]])
test_eq(dsets.train,[(1,),(2,)])
test_eq(dsets.valid,[(0,),(6,),(8,)])
test_eq(dsets.train[False,True], [(2,)])
dsets
(#5) [(0,),(1,),(2,),(3,),(4,)]
#A context manager to change the split_idx and apply the validation transform on the training set
ds = dsets.train
with ds.set_split_idx(1):
    test_eq(ds,[(2,),(4,)])
test_eq(dsets.train,[(1,),(2,)])
dsets = Datasets(range(5), [_Tfm(),noop], splits=[[1,2],[0,3,4]])
test_eq(dsets.train,[(1,1),(2,2)])
test_eq(dsets.valid,[(0,0),(6,3),(8,4)])
start = torch.arange(0,50)
tds = Datasets(start, [A()])
tdl = TfmdDL(tds, after_item=NegTfm(), bs=4)
b = tdl.one_batch()
test_eq(tdl.decode_batch(b), ((0,),(1,),(2,),(3,)))
test_stdout(tdl.show_batch, "0\n1\n2\n3")
# only transform subset 1
class _Tfm(Transform):
    split_idx=1
    def encodes(self, x): return x*2

dsets = Datasets(range(8), [None], splits=[[1,2,5,7],[0,3,4,6]])
dls = dsets.dataloaders(bs=4, after_batch=_Tfm(), shuffle=False, device=torch.device('cpu'))
test_eq(dls.train, [(tensor([1,2,5, 7]),)])
test_eq(dls.valid, [(tensor([0,6,8,12]),)])
test_eq(dls.n_inp, 1)

Methods

items = [1,2,3,4]
dsets = Datasets(items, [[neg_tfm,int2f_tfm]])

source

Datasets.dataloaders


def dataloaders(
    bs:int=64, # Batch size
    shuffle_train:bool=None, # (Deprecated, use `shuffle`) Shuffle training [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    shuffle:bool=True, # Shuffle training [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    val_shuffle:bool=False, # Shuffle validation [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    n:int=None, # Size of [`Datasets`](https://docs.fast.ai/data.core.html#datasets) used to create [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    path:str | Path='.', # Path to put in [`DataLoaders`](https://docs.fast.ai/data.core.html#dataloaders)
    dl_type:TfmdDL=None, # Type of [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)
    dl_kwargs:list=None, # List of kwargs to pass to individual [`DataLoader`](https://docs.fast.ai/data.load.html#dataloader)s
    device:torch.device=None, # Device to put [`DataLoaders`](https://docs.fast.ai/data.core.html#dataloaders)
    drop_last:bool=None, # Drop last incomplete batch, defaults to `shuffle`
    val_bs:int=None, # Validation batch size, defaults to `bs`
    num_workers:int=None, # Number of CPU cores to use in parallel (default: All available up to 16)
    verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, indexed:NoneType=None,
    persistent_workers:bool=False, pin_memory_device:str='', wif:NoneType=None, before_iter:NoneType=None,
    after_item:NoneType=None, before_batch:NoneType=None, after_batch:NoneType=None, after_iter:NoneType=None,
    create_batches:NoneType=None, create_item:NoneType=None, create_batch:NoneType=None, retain:NoneType=None,
    get_idxs:NoneType=None, sample:NoneType=None, shuffle_fn:NoneType=None, do_batch:NoneType=None
)->DataLoaders:

Get a DataLoaders

Used to create dataloaders. You may prepend ‘val_’ as in val_shuffle to override functionality for the validation set. dl_kwargs gives finer per-dataloader control if you need to work with more than one dataloader.
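A minimal sketch of that finer control (the split and the values are illustrative, not from the original docs):

dsets_demo = Datasets(range(10), [None], splits=[list(range(8)), [8,9]])
dls_demo = dsets_demo.dataloaders(bs=4, val_bs=2,           # validation overrides the batch size
                                  shuffle=True, val_shuffle=False,
                                  dl_kwargs=[{}, {'num_workers': 0}])  # per-loader kwargs
test_eq(dls_demo.train.bs, 4)
test_eq(dls_demo.valid.bs, 2)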


source

Datasets.decode


def decode(
    o, full:bool=True
):

Compose decode of all tuple_tfms then all tfms on i

test_eq(*dsets[0], -1)
test_eq(*dsets.decode((-1,)), 1)

source

Datasets.show


def show(
    o, ctx:NoneType=None, kwargs:VAR_KEYWORD
):

Show item o in ctx

test_stdout(lambda:dsets.show(dsets[1]), '-2')

source

Datasets.new_empty


def new_empty():

Create a new empty version of self, keeping only the transforms

items = [1,2,3,4]
nrm = Norm()
dsets = Datasets(items, [[neg_tfm,int2f_tfm], [neg_tfm]])
empty = dsets.new_empty()
test_eq(empty.items, [])

Add test set for inference

# only transform subset 0 (the training set)
class _Tfm1(Transform):
    split_idx=0
    def encodes(self, x): return x*3

dsets = Datasets(range(8), [[_Tfm(),_Tfm1()]], splits=[[1,2,5,7],[0,3,4,6]])
test_eq(dsets.train, [(3,),(6,),(15,),(21,)])
test_eq(dsets.valid, [(0,),(6,),(8,),(12,)])

source

test_set


def test_set(
    dsets:Datasets | TfmdLists, # Map- or iterable-style dataset from which to load the data
    test_items, # Items in test dataset
    rm_tfms:NoneType=None, # Start index of `Transform`(s) from validation set in `dsets` to apply
    with_labels:bool=False, # Whether the test items contain labels
):

Create a test set from test_items using validation transforms of dsets


#Transforms of the validation set are applied
tst = test_set(dsets, [1,2,3])
test_eq(tst, [(2,),(4,),(6,)])

source

DataLoaders.test_dl


def test_dl(
    test_items, # Items in test dataset
    rm_type_tfms:NoneType=None, # Start index of `Transform`(s) from validation set in `dsets` to apply
    with_labels:bool=False, # Whether the test items contain labels
    bs:int=64, # Size of batch
    shuffle:bool=False, # Whether to shuffle data
    num_workers:int=None, # Number of CPU cores to use in parallel (default: All available up to 16)
    verbose:bool=False, # Whether to print verbose logs
    do_setup:bool=True, # Whether to run `setup()` for batch transform(s)
    pin_memory:bool=False, timeout:int=0, batch_size:NoneType=None, drop_last:bool=False, indexed:NoneType=None,
    n:NoneType=None, device:NoneType=None, persistent_workers:bool=False, pin_memory_device:str='',
    wif:NoneType=None, before_iter:NoneType=None, after_item:NoneType=None, before_batch:NoneType=None,
    after_batch:NoneType=None, after_iter:NoneType=None, create_batches:NoneType=None, create_item:NoneType=None,
    create_batch:NoneType=None, retain:NoneType=None, get_idxs:NoneType=None, sample:NoneType=None,
    shuffle_fn:NoneType=None, do_batch:NoneType=None
):

Create a test dataloader from test_items using validation transforms of dls

dsets = Datasets(range(8), [[_Tfm(),_Tfm1()]], splits=[[1,2,5,7],[0,3,4,6]])
dls = dsets.dataloaders(bs=4, device=torch.device('cpu'))
tst_dl = dls.test_dl([2,3,4,5])
test_eq(tst_dl._n_inp, 1)
test_eq(list(tst_dl), [(tensor([ 4,  6,  8, 10]),)])
#Test you can change transforms
def add1(x): return x+1  # simple helper, defined here so the example is self-contained
tst_dl = dls.test_dl([2,3,4,5], after_item=add1)
test_eq(list(tst_dl), [(tensor([ 5,  7,  9, 11]),)])