from fastai.text.all import *
from nbdev.showdoc import show_doc
ULMFiT
Finetune a pretrained Language Model
First we get our data and tokenize it.
# Fetch the IMDB dataset; `path` is the folder the rest of the tutorial reads from.
path = untar_data(URLs.IMDB)
# Collect every .txt file under the 'unsup', 'train' and 'test' folders.
texts = get_files(path, extensions=['.txt'], folders=['unsup', 'train', 'test'])
# Number of text files found (displayed as the cell output).
len(texts)
100000
Then we put it in a `Datasets`. For a language model, we don’t have targets, so there is only one transform to numericalize the texts. Note that `tokenize_df` returns the count of the words in the corpus to make it easy to create a vocabulary.
def read_file(f):
    "Read the text of `f` and return its single-space-separated pieces as an `L`."
    raw_text = f.read_text()
    pieces = raw_text.split(' ')
    return L(pieces)
# Hold out a random 10% of the files (valid_pct=0.1) as the validation set.
splits = RandomSplitter(valid_pct=0.1)(texts)
# One transform pipeline only (tokenize, then numericalize): a language model has no separate targets.
tfms = [Tokenizer.from_folder(path), Numericalize()]
# dl_type selects LMDataLoader as the loader class used when dataloaders are built from this Datasets.
dsets = Datasets(texts, [tfms], splits=splits, dl_type=LMDataLoader)
Then we use that `Datasets` to create a `DataLoaders`. Here the class of `TfmdDL` we need to use is `LMDataLoader`, which will concatenate all the texts in a source (with a shuffle at each epoch for the training set), split it into `bs` chunks, then read continuously through it.
# Batch size and sequence length for the language-model dataloaders.
bs, sl = 256, 80
# The validation loader uses the same batch size as training (val_bs=bs).
dbunch_lm = dsets.dataloaders(bs=bs, seq_len=sl, val_bs=bs)
# Display a sample batch: each row shows an input sequence next to its target.
dbunch_lm.show_batch()
text | text_ | |
---|---|---|
0 | ▁xxbos ▁xxmaj ▁this ▁is ▁an ▁xxmaj ▁emperor ' s ▁xxmaj ▁new ▁xxmaj ▁clothes ▁situation . ▁xxmaj ▁someone ▁needs ▁to ▁say ▁" that ' s ▁not ▁a ▁funny ▁and ▁original , ▁( etc . , ▁etc . ) ▁film ; ▁that ▁is ▁an ▁inferior ▁film . ▁xxmaj ▁don ' t ▁waste ▁your ▁money ▁on ▁it ." ▁xxmaj ▁the ▁film ▁is ▁trashy , ▁and ▁the ▁people ▁in ▁it ▁are ▁embarrassingly ▁inferior ▁trailer ▁trash . ▁xxmaj ▁they ▁are ▁all - too - realistic | ▁xxmaj ▁this ▁is ▁an ▁xxmaj ▁emperor ' s ▁xxmaj ▁new ▁xxmaj ▁clothes ▁situation . ▁xxmaj ▁someone ▁needs ▁to ▁say ▁" that ' s ▁not ▁a ▁funny ▁and ▁original , ▁( etc . , ▁etc . ) ▁film ; ▁that ▁is ▁an ▁inferior ▁film . ▁xxmaj ▁don ' t ▁waste ▁your ▁money ▁on ▁it ." ▁xxmaj ▁the ▁film ▁is ▁trashy , ▁and ▁the ▁people ▁in ▁it ▁are ▁embarrassingly ▁inferior ▁trailer ▁trash . ▁xxmaj ▁they ▁are ▁all - too - realistic ally |
1 | ▁xxmaj ▁listener ▁is ▁without ▁doubt ▁one ▁of ▁the ▁dullest ▁films ▁i ▁have ▁ever ▁seen . ▁xxmaj ▁there ▁was ▁nothing ▁happening ▁in ▁this ▁film ▁what ▁so ▁ever ▁- ▁i ▁didn ' t ▁care ▁for ▁any ▁of ▁the ▁characters , ▁didn ' t ▁buy ▁in ▁to ▁the ▁whole ▁mystery ▁type ▁plot , ▁didn ' t ▁care ▁how ▁it ▁ended ▁.... nothing . ▁xxmaj ▁there ▁is ▁no ▁comedy , ▁no ▁action , ▁no ▁thrills , ▁no ▁suspense , ▁nothing . ▁xxmaj ▁the ▁highlights | ▁listener ▁is ▁without ▁doubt ▁one ▁of ▁the ▁dullest ▁films ▁i ▁have ▁ever ▁seen . ▁xxmaj ▁there ▁was ▁nothing ▁happening ▁in ▁this ▁film ▁what ▁so ▁ever ▁- ▁i ▁didn ' t ▁care ▁for ▁any ▁of ▁the ▁characters , ▁didn ' t ▁buy ▁in ▁to ▁the ▁whole ▁mystery ▁type ▁plot , ▁didn ' t ▁care ▁how ▁it ▁ended ▁.... nothing . ▁xxmaj ▁there ▁is ▁no ▁comedy , ▁no ▁action , ▁no ▁thrills , ▁no ▁suspense , ▁nothing . ▁xxmaj ▁the ▁highlights ▁include |
2 | ▁mutated ▁humans ▁who ▁have ▁been ▁outcast ▁by ▁society . ▁xxmaj ▁eventually , ▁they ▁receive ▁a ▁special ▁en vo y ▁from ▁xxmaj ▁earth ▁with ▁an ▁unexpected ▁message . ▁xxmaj ▁the ▁basic ▁problem ▁is ▁that ▁this ▁whole ▁movie ▁could ▁have ▁been ▁summarized ▁into ▁a ▁sentence ▁and ▁making ▁a ▁1 ▁hour ▁movie ▁out ▁of ▁it ▁added ▁nothing . ▁xxmaj ▁what ▁you ▁essentially ▁get ▁is ▁some ▁effectively ▁gross - looking ▁characters ▁with ▁dialog ▁that ▁is ▁so ▁boring ▁you ▁want ▁to ▁blow ▁their ▁ship ▁up | ▁humans ▁who ▁have ▁been ▁outcast ▁by ▁society . ▁xxmaj ▁eventually , ▁they ▁receive ▁a ▁special ▁en vo y ▁from ▁xxmaj ▁earth ▁with ▁an ▁unexpected ▁message . ▁xxmaj ▁the ▁basic ▁problem ▁is ▁that ▁this ▁whole ▁movie ▁could ▁have ▁been ▁summarized ▁into ▁a ▁sentence ▁and ▁making ▁a ▁1 ▁hour ▁movie ▁out ▁of ▁it ▁added ▁nothing . ▁xxmaj ▁what ▁you ▁essentially ▁get ▁is ▁some ▁effectively ▁gross - looking ▁characters ▁with ▁dialog ▁that ▁is ▁so ▁boring ▁you ▁want ▁to ▁blow ▁their ▁ship ▁up ▁every |
3 | ▁soul ▁of ▁man ▁can ▁be ▁distorted ▁in ▁such ▁a ▁way ▁that , ▁pain ▁and ▁suffering ▁being ▁brought ▁to ▁bear ▁on ▁a ▁fellow ▁human ▁being ▁is ▁in ▁some ▁way ▁satisfying . ▁xxmaj ▁be ▁it ▁mental ▁or ▁physical . ▁i ▁found ▁the ▁film ▁very ▁thought ▁provoking . ▁xxbos ▁i ▁have ▁been ▁hooked ▁on ▁" gg " ▁since ▁midway ▁through ▁2001 - 2002 ▁(2 nd ▁season ), ▁when ▁i ▁tuned ▁in ▁to ▁see ▁" small ville " ▁10 ▁minutes ▁early . ▁xxmaj ▁thanks | ▁of ▁man ▁can ▁be ▁distorted ▁in ▁such ▁a ▁way ▁that , ▁pain ▁and ▁suffering ▁being ▁brought ▁to ▁bear ▁on ▁a ▁fellow ▁human ▁being ▁is ▁in ▁some ▁way ▁satisfying . ▁xxmaj ▁be ▁it ▁mental ▁or ▁physical . ▁i ▁found ▁the ▁film ▁very ▁thought ▁provoking . ▁xxbos ▁i ▁have ▁been ▁hooked ▁on ▁" gg " ▁since ▁midway ▁through ▁2001 - 2002 ▁(2 nd ▁season ), ▁when ▁i ▁tuned ▁in ▁to ▁see ▁" small ville " ▁10 ▁minutes ▁early . ▁xxmaj ▁thanks ▁to |
4 | ▁came ▁to ▁the ▁cinemas , ▁and ▁brought ▁my ▁entire ▁family ▁along . ▁i ▁had ▁already ▁seen ▁xxmaj ▁jim ▁xxmaj ▁carrey ▁in ▁the ▁xxmaj ▁mask , ▁and ▁xxmaj ▁ace ▁xxmaj ▁ventura , ▁and ▁loved ▁him ▁in ▁those ▁films . ▁xxmaj ▁the ▁review ▁of ▁the ▁film ▁was ▁quite ▁good , ▁so ▁i ▁looked ▁forward ▁to ▁this . ▁xxmaj ▁my ▁father ▁wondered ▁if ▁it ▁really ▁was ▁a ▁movie ▁in ▁his ▁taste ▁... ▁and ▁then ▁the ▁movie ▁started . ▁i ▁have ▁never ▁in ▁my | ▁to ▁the ▁cinemas , ▁and ▁brought ▁my ▁entire ▁family ▁along . ▁i ▁had ▁already ▁seen ▁xxmaj ▁jim ▁xxmaj ▁carrey ▁in ▁the ▁xxmaj ▁mask , ▁and ▁xxmaj ▁ace ▁xxmaj ▁ventura , ▁and ▁loved ▁him ▁in ▁those ▁films . ▁xxmaj ▁the ▁review ▁of ▁the ▁film ▁was ▁quite ▁good , ▁so ▁i ▁looked ▁forward ▁to ▁this . ▁xxmaj ▁my ▁father ▁wondered ▁if ▁it ▁really ▁was ▁a ▁movie ▁in ▁his ▁taste ▁... ▁and ▁then ▁the ▁movie ▁started . ▁i ▁have ▁never ▁in ▁my ▁movie |
5 | ▁or ▁xxmaj ▁ladd ▁and ▁xxmaj ▁lake ) to ▁soften ▁the ▁violent ▁elements . ▁" kiss ▁xxmaj ▁tomorrow ▁xxmaj ▁goodbye " ▁( a ▁gem ▁of ▁a ▁noir ▁title ▁if ▁there ▁ever ▁was ▁one , ▁be speaking ▁a ▁bleak , ▁fatalistic ▁vision ) ▁contains ▁no ▁romantic ▁subplot , ▁unless ▁you ▁count ▁the ▁xxmaj ▁cagney ▁character ' s ▁involvement ▁with ▁the ▁rich ▁woman , ▁which ▁is ▁more ▁about ▁greed ▁and ▁rebellion ▁than ▁love . ▁xxmaj ▁this ▁film ▁is ▁a ▁great ▁of ▁example ▁of | ▁xxmaj ▁ladd ▁and ▁xxmaj ▁lake ) to ▁soften ▁the ▁violent ▁elements . ▁" kiss ▁xxmaj ▁tomorrow ▁xxmaj ▁goodbye " ▁( a ▁gem ▁of ▁a ▁noir ▁title ▁if ▁there ▁ever ▁was ▁one , ▁be speaking ▁a ▁bleak , ▁fatalistic ▁vision ) ▁contains ▁no ▁romantic ▁subplot , ▁unless ▁you ▁count ▁the ▁xxmaj ▁cagney ▁character ' s ▁involvement ▁with ▁the ▁rich ▁woman , ▁which ▁is ▁more ▁about ▁greed ▁and ▁rebellion ▁than ▁love . ▁xxmaj ▁this ▁film ▁is ▁a ▁great ▁of ▁example ▁of ▁pure |
6 | ▁xxmaj ▁swann ▁( o ' toole ) ▁out ▁of ▁trouble ▁ . ▁xxmaj ▁by ▁no ▁means ▁an ▁easy ▁task . ▁xxmaj ▁the ▁things ▁they ▁do ▁together ▁are ▁quite ▁out ▁of ▁the ▁ordinary . ▁xxmaj ▁though ▁he ▁is ▁in ▁charge ▁of ▁keeping ▁xxmaj ▁swann ▁out ▁of ▁trouble , ▁he ▁really ▁gets ▁to ▁enjoy ▁his ▁job . ▁xxmaj ▁this ▁film ▁also ▁show ▁the ▁side ▁of ▁the ▁entertainment ▁history ▁we ▁don ' t ▁see , ▁xxmaj ▁swann ▁( o ' toole ▁ ) | ▁swann ▁( o ' toole ) ▁out ▁of ▁trouble ▁ . ▁xxmaj ▁by ▁no ▁means ▁an ▁easy ▁task . ▁xxmaj ▁the ▁things ▁they ▁do ▁together ▁are ▁quite ▁out ▁of ▁the ▁ordinary . ▁xxmaj ▁though ▁he ▁is ▁in ▁charge ▁of ▁keeping ▁xxmaj ▁swann ▁out ▁of ▁trouble , ▁he ▁really ▁gets ▁to ▁enjoy ▁his ▁job . ▁xxmaj ▁this ▁film ▁also ▁show ▁the ▁side ▁of ▁the ▁entertainment ▁history ▁we ▁don ' t ▁see , ▁xxmaj ▁swann ▁( o ' toole ▁ ) ▁confesses |
7 | , ▁it ▁just ▁keeps ▁plodding ▁on . ▁xxmaj ▁christopher ▁xxmaj ▁walken ▁has ▁a ▁part , ▁but ▁it ▁is ▁completely ▁senseless , ▁as ▁is ▁most ▁of ▁the ▁movie . ▁xxmaj ▁this ▁movie ▁had ▁potential , ▁but ▁it ▁looks ▁like ▁some ▁really ▁bad ▁made ▁for ▁xxup ▁tv ▁movie . ▁i ▁would ▁avoid ▁this ▁movie . ▁xxbos ▁" men ace " ▁is ▁not ▁funny . ▁xxmaj ▁it ▁tries ▁hard ▁- ▁too ▁hard . ▁but ▁rarely ▁brings ▁a ▁smile . ▁xxmaj ▁there ▁is ▁no | ▁it ▁just ▁keeps ▁plodding ▁on . ▁xxmaj ▁christopher ▁xxmaj ▁walken ▁has ▁a ▁part , ▁but ▁it ▁is ▁completely ▁senseless , ▁as ▁is ▁most ▁of ▁the ▁movie . ▁xxmaj ▁this ▁movie ▁had ▁potential , ▁but ▁it ▁looks ▁like ▁some ▁really ▁bad ▁made ▁for ▁xxup ▁tv ▁movie . ▁i ▁would ▁avoid ▁this ▁movie . ▁xxbos ▁" men ace " ▁is ▁not ▁funny . ▁xxmaj ▁it ▁tries ▁hard ▁- ▁too ▁hard . ▁but ▁rarely ▁brings ▁a ▁smile . ▁xxmaj ▁there ▁is ▁no ▁acting |
8 | ▁pink ▁xxmaj ▁panther . ▁xxmaj ▁this ▁was ▁an ▁xxup ▁rko ▁movie ▁but ▁it ▁did ▁not ▁have ▁the ▁nice ▁airplane ▁logo ▁that ▁xxup ▁rko ▁used ▁to ▁use . ▁i ▁liked ▁xxmaj ▁victor ▁xxmaj ▁mature ▁in ▁xxmaj ▁one ▁xxmaj ▁million , ▁xxup ▁b . c . , ▁and ▁xxmaj ▁sam p son ▁and ▁xxmaj ▁delilah ▁and ▁especially ▁in ▁xxmaj ▁violent ▁xxmaj ▁saturday . ▁xxmaj ▁see ▁if ▁you ▁can ▁find ▁that ▁one . ▁xxmaj ▁he ▁was ▁wonderful ▁in ▁the ▁comedy ▁with ▁xxmaj ▁peter | ▁xxmaj ▁panther . ▁xxmaj ▁this ▁was ▁an ▁xxup ▁rko ▁movie ▁but ▁it ▁did ▁not ▁have ▁the ▁nice ▁airplane ▁logo ▁that ▁xxup ▁rko ▁used ▁to ▁use . ▁i ▁liked ▁xxmaj ▁victor ▁xxmaj ▁mature ▁in ▁xxmaj ▁one ▁xxmaj ▁million , ▁xxup ▁b . c . , ▁and ▁xxmaj ▁sam p son ▁and ▁xxmaj ▁delilah ▁and ▁especially ▁in ▁xxmaj ▁violent ▁xxmaj ▁saturday . ▁xxmaj ▁see ▁if ▁you ▁can ▁find ▁that ▁one . ▁xxmaj ▁he ▁was ▁wonderful ▁in ▁the ▁comedy ▁with ▁xxmaj ▁peter ▁xxmaj |
Then we have a convenience method to directly grab a `Learner` from it, using the `AWD_LSTM` architecture.
# Optimizer factory: Adam with weight decay fixed at 0.1.
opt_func = partial(Adam, wd=0.1)
# Language-model learner on the AWD_LSTM architecture, tracking accuracy and perplexity;
# path=path makes the learner write its files under the dataset folder.
learn = language_model_learner(dbunch_lm, AWD_LSTM, opt_func=opt_func, metrics=[accuracy, Perplexity()], path=path)
# Switch the learner to fp16 training.
learn = learn.to_fp16()
# One epoch with the one-cycle schedule at a maximum learning rate of 2e-2.
learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7,0.8))
epoch | train_loss | valid_loss | accuracy | perplexity | time |
---|---|---|---|---|---|
0 | 4.426135 | 3.984901 | 0.292371 | 53.779987 | 07:00 |
# Checkpoint the learner under the name 'stage1', then reload that checkpoint
# (the trailing `;` suppresses the notebook's display of the return value).
learn.save('stage1')
learn.load('stage1');
# Unfreeze the model and train for ten epochs at a lower maximum learning rate (2e-3).
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3, moms=(0.8,0.7,0.8))
epoch | train_loss | valid_loss | accuracy | perplexity | time |
---|---|---|---|---|---|
0 | 4.163227 | 3.870354 | 0.306840 | 47.959347 | 07:24 |
1 | 4.055693 | 3.790802 | 0.316436 | 44.291908 | 07:41 |
2 | 3.979279 | 3.729021 | 0.323357 | 41.638317 | 07:22 |
3 | 3.919654 | 3.688891 | 0.327770 | 40.000469 | 07:22 |
4 | 3.889432 | 3.660633 | 0.330762 | 38.885933 | 07:22 |
5 | 3.842923 | 3.637397 | 0.333315 | 37.992798 | 07:26 |
6 | 3.813823 | 3.619074 | 0.335308 | 37.303013 | 07:25 |
7 | 3.793213 | 3.608010 | 0.336566 | 36.892574 | 07:20 |
8 | 3.766456 | 3.602140 | 0.337257 | 36.676647 | 07:22 |
9 | 3.759768 | 3.600955 | 0.337450 | 36.633202 | 07:23 |
Once we have fine-tuned the pretrained language model to this corpus, we save the encoder since we will use it for the classifier.
# Save only the encoder under the name 'finetuned1'; the classifier loads it back by this name.
learn.save_encoder('finetuned1')
Use it to train a classifier
# Only the 'train' and 'test' folders are used for classification ('unsup' is left out).
texts = get_files(path, extensions=['.txt'], folders=['train', 'test'])
# Files whose grandparent folder is named 'test' form the validation set.
splits = GrandparentSplitter(valid_name='test')(texts)
For classification, we need to use two sets of transforms: one to numericalize the texts and the other to encode the labels as categories.
# Input pipeline: tokenize, then numericalize with the language model's vocabulary
# (vocab=dbunch_lm.vocab) rather than building a new one.
x_tfms = [Tokenizer.from_folder(path), Numericalize(vocab=dbunch_lm.vocab)]
# Second pipeline builds the target: the label from parent_label, encoded as a category.
dsets = Datasets(texts, [x_tfms, [parent_label, Categorize()]], splits=splits, dl_type=SortedDL)
bs = 64
# pad_input_chunk is applied to each batch before collation (before_batch).
dls = dsets.dataloaders(before_batch=pad_input_chunk, bs=bs)
# Display two samples with their category.
dls.show_batch(max_n=2)
text | category | |
---|---|---|
0 | xxbos * * attention xxmaj spoilers * * \n\n xxmaj first of all , let me say that xxmaj rob xxmaj roy is one of the best films of the 90 's . xxmaj it was an amazing achievement for all those involved , especially the acting of xxmaj liam xxmaj neeson , xxmaj jessica xxmaj lange , xxmaj john xxmaj hurt , xxmaj brian xxmaj cox , and xxmaj tim xxmaj roth . xxmaj michael xxmaj canton xxmaj jones painted a wonderful portrait of the honor and dishonor that men can represent in themselves . xxmaj but alas … \n\n it constantly , and unfairly gets compared to " braveheart " . xxmaj these are two entirely different films , probably only similar in the fact that they are both about xxmaj scots in historical xxmaj scotland . xxmaj yet , this comparison frequently bothers me because it seems | pos |
1 | xxbos xxmaj by now you 've probably heard a bit about the new xxmaj disney dub of xxmaj miyazaki 's classic film , xxmaj laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky . xxmaj during late summer of 1998 , xxmaj disney released " kiki 's xxmaj delivery xxmaj service " on video which included a preview of the xxmaj laputa dub saying it was due out in " 1 xxrep 3 9 " . xxmaj it 's obviously way past that year now , but the dub has been finally completed . xxmaj and it 's not " laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky " , just " castle xxmaj in xxmaj the xxmaj sky " for the dub , since xxmaj laputa is not such a nice word in xxmaj spanish ( even though they use the word xxmaj laputa many times | pos |
Then we once again have a convenience function to create a classifier from this `DataLoaders` with the `AWD_LSTM` architecture.
# Optimizer factory: Adam with weight decay fixed at 0.1.
opt_func = partial(Adam, wd=0.1)
# Text classifier on the AWD_LSTM architecture, tracking accuracy; drop_mult=0.5 is passed
# through to the learner factory, and path=path keeps its files under the dataset folder.
learn = text_classifier_learner(dls, AWD_LSTM, metrics=[accuracy], path=path, drop_mult=0.5, opt_func=opt_func)
We load our pretrained encoder.
# Load the encoder saved under the name 'finetuned1' by the language-model stage.
learn = learn.load_encoder('finetuned1')
# Switch to fp16 training, passing clip=0.1 through to to_fp16.
learn = learn.to_fp16(clip=0.1)
Then we can train with gradual unfreezing and differential learning rates.
# Base learning rate of 1e-1, scaled by the batch size relative to 128.
lr = 1e-1 * bs/128
# First stage: one epoch at that learning rate, before any freeze_to/unfreeze call.
learn.fit_one_cycle(1, lr, moms=(0.8,0.7,0.8), wd=0.1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.328318 | 0.200650 | 0.922120 | 01:08 |
# Gradual unfreezing, step 2: freeze up to parameter group -2, leaving the last two groups trainable.
learn.freeze_to(-2)
# Halve the learning rate for this stage.
lr /= 2
# Differential learning rates: slice spreads them from lr/(2.6**4) up to lr across the groups.
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7,0.8), wd=0.1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.208120 | 0.166004 | 0.937440 | 01:15 |
# Gradual unfreezing, step 3: freeze up to parameter group -3, leaving the last three groups trainable.
learn.freeze_to(-3)
# Halve the learning rate again.
lr /= 2
# Differential learning rates: slice spreads them from lr/(2.6**4) up to lr across the groups.
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7,0.8), wd=0.1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.162498 | 0.154959 | 0.942400 | 01:35 |
# Final stage: unfreeze the whole model.
learn.unfreeze()
# Divide the learning rate by 5 for full fine-tuning.
lr /= 5
# Two epochs with differential learning rates from lr/(2.6**4) up to lr.
learn.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7,0.8), wd=0.1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.133800 | 0.163456 | 0.940560 | 01:34 |
1 | 0.095326 | 0.154301 | 0.945120 | 01:34 |