GitHub Repository: labmlai/annotated_deep_learning_paper_implementations
Path: blob/master/labml_nn/utils/tokenizer.py
from typing import Callable

from labml.configs import BaseConfigs, option


class TokenizerConfigs(BaseConfigs):
    """
    <a id="TokenizerConfigs"></a>

    ## Tokenizer Configurations
    """

    # The tokenizer to use, given by the name of a registered option;
    # defaults to the character-level tokenizer defined below
    tokenizer: Callable = 'character'

    def __init__(self):
        # Mark `tokenizer` as the primary option of this configuration class
        super().__init__(_primary='tokenizer')
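The string default `'character'` above names one of the functions registered with `@option` below; the name is resolved to the matching option function when the configuration is computed. Below is a minimal sketch of how an additional tokenizer could be registered with the same mechanism; `whitespace` and its inner function are illustrative names and not part of this module.

```python
# Sketch only: registering an extra tokenizer option with the same mechanism.
@option(TokenizerConfigs.tokenizer)
def whitespace():
    def whitespace_tokenizer(x: str):
        # Split the text on runs of whitespace
        return x.split()

    return whitespace_tokenizer
```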
@option(TokenizerConfigs.tokenizer)
def basic_english():
    """
    ### Basic English tokenizer

    We use a character-level tokenizer in this experiment.
    You can switch to this tokenizer by setting

    ```
    'tokenizer': 'basic_english'
    ```

    in the configurations dictionary when starting the experiment.
    """
    from torchtext.data import get_tokenizer
    return get_tokenizer('basic_english')
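To give a feel for the difference between the two tokenizers, here is a small sketch of what torchtext's `basic_english` tokenizer produces; the exact output depends on the installed torchtext version, but it lowercases the text and splits punctuation into separate tokens.

```python
# Illustrative only; requires torchtext to be installed.
from torchtext.data import get_tokenizer

tokenize = get_tokenizer('basic_english')
tokenize("Hello, World!")  # expected: ['hello', ',', 'world', '!']
# The character-level tokenizer defined below would instead give
# ['H', 'e', 'l', 'l', 'o', ',', ' ', 'W', 'o', 'r', 'l', 'd', '!']
```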
def character_tokenizer(x: str):
    """
    ### Character level tokenizer

    Splits the text into a list of individual characters.
    """
    return list(x)
@option(TokenizerConfigs.tokenizer)
def character():
    """
    Character level tokenizer configuration
    """
    return character_tokenizer
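For context, this is roughly how the tokenizer is selected from an experiment, following the pattern described in the `basic_english` docstring. A minimal sketch, assuming the standard `labml.experiment` API; the experiment name is illustrative, and a real experiment would normally inherit `TokenizerConfigs` in a larger configuration class.

```python
from labml import experiment

experiment.create(name='tokenizer_demo')  # illustrative experiment name
conf = TokenizerConfigs()
# Override the default ('character') to use torchtext's basic_english tokenizer
experiment.configs(conf, {'tokenizer': 'basic_english'})
```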