Path: blob/master/labml_nn/utils/tokenizer.py
from typing import Callable

from labml.configs import BaseConfigs, option


class TokenizerConfigs(BaseConfigs):
    """
    <a id="TokenizerConfigs"></a>

    ## Tokenizer Configurations
    """

    # The tokenizer option; defaults to the character level tokenizer
    tokenizer: Callable = 'character'

    def __init__(self):
        # `tokenizer` is the primary option of this configuration class
        super().__init__(_primary='tokenizer')


@option(TokenizerConfigs.tokenizer)
def basic_english():
    """
    ### Basic English tokenizer

    We use a character level tokenizer in this experiment by default.
    You can switch to the basic English tokenizer by setting,

    ```
    'tokenizer': 'basic_english'
    ```

    in the configurations dictionary when starting the experiment.
    """
    from torchtext.data import get_tokenizer
    return get_tokenizer('basic_english')


def character_tokenizer(x: str):
    """
    ### Character level tokenizer
    """
    return list(x)


@option(TokenizerConfigs.tokenizer)
def character():
    """
    Character level tokenizer configuration
    """
    return character_tokenizer
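As a rough illustration of how the configuration switch described in the `basic_english` docstring might look in practice, here is a minimal sketch, assuming a standard labml experiment setup; the experiment name and the override dictionary are illustrative and not part of this module.

```
from labml import experiment
from labml_nn.utils.tokenizer import TokenizerConfigs

# Create the configurations object; the default tokenizer option is 'character'
conf = TokenizerConfigs()

# Hypothetical experiment name; the tokenizer is switched by passing an
# override in the configurations dictionary
experiment.create(name='tokenizer_example')
experiment.configs(conf, {'tokenizer': 'basic_english'})

with experiment.start():
    # Accessing `conf.tokenizer` resolves the selected option to a callable,
    # here the torchtext 'basic_english' tokenizer
    print(conf.tokenizer('Hello, World!'))
```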