Text core
Basic functions to preprocess text before assembling it in DataLoaders.
Preprocessing rules
The following are rules applied to texts before or after they're tokenized.
spec_add_spaces
spec_add_spaces (t)
Add spaces around / and #
test_eq(spec_add_spaces('#fastai'), ' # fastai')
test_eq(spec_add_spaces('/fastai'), ' / fastai')
test_eq(spec_add_spaces('\\fastai'), ' \\ fastai')
rm_useless_spaces
rm_useless_spaces (t)
Remove multiple spaces
test_eq(rm_useless_spaces('a  b   c'), 'a b c')
replace_rep
replace_rep (t)
Replace repetitions at the character level: cccc – TK_REP 4 c
It starts replacing at 3 repetitions of the same character or more.
test_eq(replace_rep('aa'), 'aa')
test_eq(replace_rep('aaaa'), f' {TK_REP} 4 a ')
replace_wrep
replace_wrep (t)
Replace word repetitions: word word word word – TK_WREP 4 word
It starts replacing at 3 repetitions of the same word or more.
test_eq(replace_wrep('ah ah'), 'ah ah')
test_eq(replace_wrep('ah ah ah'), f' {TK_WREP} 3 ah ')
test_eq(replace_wrep('ah ah ah ah'), f' {TK_WREP} 4 ah ')
test_eq(replace_wrep('ah ah ah ah '), f' {TK_WREP} 4 ah ')
test_eq(replace_wrep('ah ah ah ah.'), f' {TK_WREP} 4 ah .')
test_eq(replace_wrep('ah ah ahi'), 'ah ah ahi')
fix_html
fix_html (x)
Various messy things we’ve seen in documents
test_eq(fix_html('#39;bli#146;'), "'bli'")
test_eq(fix_html('Sarah amp; Duck...'), 'Sarah & Duck …')
test_eq(fix_html('a nbsp; #36;'), 'a $')
test_eq(fix_html('\\" <unk>'), f'" {UNK}')
test_eq(fix_html('quot; @.@ @-@ '), "' .-")
test_eq(fix_html('<br />text\\n'), '\ntext\n')
replace_all_caps
replace_all_caps (t)
Replace tokens in ALL CAPS by their lower version and add TK_UP before.
test_eq(replace_all_caps("I'M SHOUTING"), f"{TK_UP} i'm {TK_UP} shouting")
test_eq(replace_all_caps("I'm speaking normally"), "I'm speaking normally")
test_eq(replace_all_caps("I am speaking normally"), "i am speaking normally")
replace_maj
replace_maj (t)
Replace tokens in Sentence Case by their lower version and add TK_MAJ before.
test_eq(replace_maj("Jeremy Howard"), f'{TK_MAJ} jeremy {TK_MAJ} howard')
test_eq(replace_maj("I don't think there is any maj here"), "i don't think there is any maj here")
lowercase
lowercase (t, add_bos=True, add_eos=False)
Converts t to lowercase
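A minimal sketch of how the flags behave, assuming the default special tokens (BOS is 'xxbos', EOS is 'xxeos') and the spacing pattern used by the other rules above:
test_eq(lowercase('Hello World'), f'{BOS} hello world')
test_eq(lowercase('Hello World', add_eos=True), f'{BOS} hello world {EOS}')
test_eq(lowercase('Hello World', add_bos=False), 'hello world')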
replace_space
replace_space (t)
Replace embedded spaces in a token with unicode line char to allow for split/join
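For example (a sketch, assuming the replacement character is the same '▁' that appears in the tokenizer outputs further down this page):
test_eq(replace_space('hello world'), 'hello▁world')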
Tokenizing
A tokenizer is a class that must implement __call__. This method receives an iterator of texts and must return a generator with their tokenized versions. Here is the most basic example:
BaseTokenizer
BaseTokenizer (split_char=' ', **kwargs)
Basic tokenizer that just splits on spaces
tok = BaseTokenizer()
test_eq(tok(["This is a text"]), [["This", "is", "a", "text"]])
tok = BaseTokenizer('x')
test_eq(tok(["This is a text"]), [["This is a te", "t"]])
SpacyTokenizer
SpacyTokenizer (lang='en', special_toks=None, buf_sz=5000)
Spacy tokenizer for lang
tok = SpacyTokenizer()
inp,exp = "This isn't the easiest text.",["This", "is", "n't", "the", "easiest", "text", "."]
test_eq(L(tok([inp,inp])), [exp,exp])
TokenizeWithRules
TokenizeWithRules (tok, rules=None, post_rules=None)
A wrapper around tok which applies rules, then tokenizes, then applies post_rules
f = TokenizeWithRules(BaseTokenizer(), rules=[replace_all_caps])
test_eq(f(["THIS isn't a problem"]), [[TK_UP, 'this', "isn't", 'a', 'problem']])
f = TokenizeWithRules(SpacyTokenizer())
test_eq(f(["This isn't a problem"]), [[BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem']])
f = TokenizeWithRules(BaseTokenizer(split_char="'"), rules=[])
test_eq(f(["This isn't a problem"]), [['This▁isn', 't▁a▁problem']])
The main function that will be called during one of the processes handling tokenization. It will iterate through the batch of texts, apply rules to them and tokenize them.
texts = ["this is a text", "this is another text"]
tok = TokenizeWithRules(BaseTokenizer(), texts.__getitem__)
test_eq(tok([0,1]), [['this', 'is', 'a', 'text'],['this', 'is', 'another', 'text']])
tokenize1
tokenize1 (text, tok, rules=None, post_rules=None)
Call TokenizeWithRules with a single text
test_eq(tokenize1("This isn't a problem", SpacyTokenizer()),
        [BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem'])
test_eq(tokenize1("This isn't a problem", tok=BaseTokenizer(), rules=[]),
        ['This',"isn't",'a','problem'])
parallel_tokenize
parallel_tokenize (items, tok=None, rules=None, n_workers=4, **kwargs)
Calls optional setup on tok before launching TokenizeWithRules using parallel_gen
Note that since this uses parallel_gen behind the scenes, the generator returned contains tuples of indices and results. There is no guarantee that the results are returned in order, so you should sort by the first item of the tuples (the indices) if you need them ordered.
res = parallel_tokenize(['0 1', '1 2'], rules=[], n_workers=2)
idxs,toks = zip(*L(res).sorted(itemgetter(0)))
test_eq(toks, [['0','1'],['1','2']])
Tokenize texts in files
Preprocessing function for texts in filenames. Tokenized texts will be saved in a similar fashion in a directory suffixed with _tok in the parent folder of path (override with output_dir). This directory is the return value.
tokenize_folder
tokenize_folder (path, extensions=None, folders=None, output_dir=None, skip_if_exists=True, output_names=None, n_workers=4, rules=None, tok=None, encoding='utf8')
Tokenize text files in path in parallel using n_workers
The result will be in output_dir (defaults to a folder in the same parent directory as path, with _tok added to path.name) with the same structure as in path. Tokenized texts for a given file will be in the file having the same name in output_dir. Additionally, a file with a .len suffix contains the number of tokens, and the count of all words is stored in output_dir/counter.pkl.
extensions will default to ['.txt'] and all text files in path are processed unless you restrict them to a list of subfolders with folders. rules (which default to defaults.text_proc_rules) are applied to each text before it goes into the tokenizer.
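For instance, a sketch of a typical call (the dataset path and folder names below are hypothetical):
path = Path('data/texts')                      # hypothetical folder with 'train' and 'test' subfolders of .txt files
out = tokenize_folder(path, folders=['train', 'test'], n_workers=2)
# `out` is the output directory (here data/texts_tok) mirroring `path`;
# the word counts are stored in out/'counter.pkl'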
tokenize_files
tokenize_files (files, path, output_dir, output_names=None, n_workers=4, rules=None, tok=None, encoding='utf8', skip_if_exists=False)
Tokenize text files in parallel using n_workers
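A sketch of this lower-level variant, for when you already have the list of files to process (the paths are hypothetical):
path = Path('data/texts')                      # hypothetical folder of .txt files
files = get_text_files(path)
tokenize_files(files, path, output_dir=path.parent/'texts_tok', n_workers=2)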
Tokenize texts in a dataframe
tokenize_texts
tokenize_texts (texts, n_workers=4, rules=None, tok=None)
Tokenize texts in parallel using n_workers
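A minimal sketch on an in-memory list of texts:
txts = ["This is a first text", "And here is another one"]
toks = tokenize_texts(txts, n_workers=1)
# one list of tokens per input text, with the default rules (xxbos, xxmaj, ...) applied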
tokenize_df
tokenize_df (df, text_cols, n_workers=4, rules=None, mark_fields=None, tok=None, tok_text_col='text')
Tokenize texts in df[text_cols] in parallel using n_workers and store them in df[tok_text_col]
This function returns a new dataframe with the same non-text columns, a column named text (or whatever you pass as tok_text_col) that contains the tokenized texts, and a column named text_lengths that contains their respective lengths. It also returns a counter of all seen words to quickly build a vocabulary afterward.
rules (which default to defaults.text_proc_rules) are applied to each text before it goes into the tokenizer. If mark_fields isn't specified, it defaults to False when there is a single text column, True when there are several. In that case, the texts in each of those columns are joined with FLD markers followed by the number of the field.
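A small sketch on an in-memory dataframe (the column names are hypothetical):
df = pd.DataFrame({'text': ['This is a test', 'this is another test'], 'label': [0, 1]})
tok_df, counter = tokenize_df(df, text_cols='text', n_workers=1)
# tok_df keeps 'label' and adds the tokenized texts and their lengths as described above;
# `counter` is a Counter over every token seen, handy for building a vocabulary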
tokenize_csv
tokenize_csv (fname, text_cols, outname=None, n_workers=4, rules=None, mark_fields=None, tok=None, header='infer', chunksize=50000)
Tokenize texts in the text_cols of the csv fname in parallel using n_workers
load_tokenized_csv
load_tokenized_csv (fname)
Utility function to quickly load a tokenized csv and the corresponding counter
The result will be written to a new csv file in outname (defaults to the same as fname with the suffix _tok.csv) and will have the same header as the original file, the same non-text columns, and a text and a text_lengths column as described in tokenize_df.
rules (which default to defaults.text_proc_rules) are applied to each text before it goes into the tokenizer. If mark_fields isn't specified, it defaults to False when there is a single text column, True when there are several. In that case, the texts in each of those columns are joined with FLD markers followed by the number of the field.
The csv file is opened with header and, optionally, read in blocks of chunksize rows at a time. When chunksize is set, each chunk is processed independently and written to the output file as it goes, to save memory.
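A sketch of a round trip (the file names are hypothetical; outname is passed explicitly so the two calls agree on the output path):
df = pd.DataFrame({'text': ['This is a test', 'this is another test'], 'label': [0, 1]})
df.to_csv('texts.csv', index=False)
tokenize_csv(Path('texts.csv'), text_cols='text', outname=Path('texts_tok.csv'), n_workers=1)
tok_df, counter = load_tokenized_csv(Path('texts_tok.csv'))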
def _prepare_texts(tmp_d):
    "Prepare texts in a folder struct in tmp_d, a csv file and returns a dataframe"
    path = Path(tmp_d)/'tmp'
    path.mkdir()
    for d in ['a', 'b', 'c']:
        (path/d).mkdir()
        for i in range(5):
            with open(path/d/f'text{i}.txt', 'w') as f: f.write(f"This is an example of text {d} {i}")
    texts = [f"This is an example of text {d} {i}" for i in range(5) for d in ['a', 'b', 'c']]
    df = pd.DataFrame({'text': texts, 'label': list(range(15))}, columns=['text', 'label'])
    csv_fname = tmp_d/'input.csv'
    df.to_csv(csv_fname, index=False)
    return path,df,csv_fname
Tokenizer
Tokenizer
Tokenizer (tok, rules=None, counter=None, lengths=None, mode=None, sep=' ')
Provides a consistent Transform interface to tokenizers operating on DataFrames and folders
with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    items = get_text_files(path)
    splits = RandomSplitter()(items)
    dsets = Datasets(items, [Tokenizer.from_folder(path)], splits=splits)
    print(dsets.train[0])
    dsets = Datasets(df, [Tokenizer.from_df('text')], splits=splits)
    print(dsets.train[0][0].text)
(['xxbos', 'xxmaj', 'this', 'is', 'an', 'example', 'of', 'text', 'b', '0'],)
('xxbos', 'xxmaj', 'this', 'is', 'an', 'example', 'of', 'text', 'c', '3')
tst = test_set(dsets, ['This is a test', 'this is another test'])
test_eq(tst, [(['xxbos', 'xxmaj', 'this','is','a','test'],),
              (['xxbos','this','is','another','test'],)])
Sentencepiece
SentencePieceTokenizer
SentencePieceTokenizer (lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000, model_type='unigram', char_coverage=None, cache_dir='tmp')
SentencePiece tokenizer for lang
= [f"This is an example of text {i}" for i in range(10)]
texts = pd.DataFrame({'text': texts, 'label': list(range(10))}, columns=['text', 'label'])
df = tokenize_df(df, text_cols='text', tok=SentencePieceTokenizer(vocab_sz=34), n_workers=1) out,cnt
with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    items = get_text_files(path)
    splits = RandomSplitter()(items)
    tok = SentencePieceTokenizer(special_toks=[])
    dsets = Datasets(items, [Tokenizer.from_folder(path, tok=tok)], splits=splits)
    print(dsets.train[0][0])
    with warnings.catch_warnings():
        dsets = Datasets(df, [Tokenizer.from_df('text', tok=tok)], splits=splits)
        print(dsets.train[0][0].text)
['▁xx', 'b', 'o', 's', '▁xx', 'm', 'a', 'j', '▁t', 'h', 'i', 's', '▁', 'i', 's', '▁a', 'n', '▁', 'ex', 'a', 'm', 'p', 'l', 'e', '▁', 'o', 'f', '▁t', 'ex', 't', '▁', 'b', '▁', '2']
['▁xx', 'b', 'o', 's', '▁xx', 'm', 'a', 'j', '▁t', 'h', 'i', 's', '▁', 'i', 's', '▁a', 'n', '▁', 'ex', 'a', 'm', 'p', 'l', 'e', '▁', 'o', 'f', '▁t', 'ex', 't', '▁a', '▁', '4']