GitHub Repository: jantic/deoldify
Path: blob/master/fastai/text/transform.py
"NLP data processing; tokenizes text and creates vocab indexes"
from ..torch_core import *

import spacy
from spacy.symbols import ORTH
__all__ = ['BaseTokenizer', 'SpacyTokenizer', 'Tokenizer', 'Vocab', 'fix_html', 'replace_all_caps', 'replace_rep', 'replace_wrep',
           'rm_useless_spaces', 'spec_add_spaces', 'BOS', 'EOS', 'FLD', 'UNK', 'PAD', 'TK_MAJ', 'TK_UP', 'TK_REP', 'TK_WREP',
           'deal_caps']
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
defaults.text_spec_tok = [UNK,PAD,BOS,EOS,FLD,TK_MAJ,TK_UP,TK_REP,TK_WREP]
class BaseTokenizer():
    "Basic class for a tokenizer function."
    def __init__(self, lang:str): self.lang = lang
    def tokenizer(self, t:str) -> List[str]: return t.split(' ')
    def add_special_cases(self, toks:Collection[str]): pass
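# Illustrative usage sketch: BaseTokenizer is the minimal interface the Tokenizer class
# relies on -- a naive whitespace split with no special-case handling.
#
#   BaseTokenizer('en').tokenizer('hello world')  # -> ['hello', 'world']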
class SpacyTokenizer(BaseTokenizer):
    "Wrapper around a spacy tokenizer to make it a `BaseTokenizer`."
    def __init__(self, lang:str):
        self.tok = spacy.blank(lang, disable=["parser","tagger","ner"])

    def tokenizer(self, t:str) -> List[str]:
        return [t.text for t in self.tok.tokenizer(t)]

    def add_special_cases(self, toks:Collection[str]):
        for w in toks:
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
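# Illustrative usage sketch (assumes spaCy's English language data is available):
# SpacyTokenizer wraps spaCy's rule-based tokenizer, with parser/tagger/NER disabled
# for speed, and add_special_cases keeps the fastai marker tokens from being split.
#
#   tok = SpacyTokenizer('en')
#   tok.add_special_cases(defaults.text_spec_tok)
#   tok.tokenizer("xxbos Don't panic!")
#   # -> something like ['xxbos', 'Do', "n't", 'panic', '!']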
def spec_add_spaces(t:str) -> str:
    "Add spaces around / and # in `t`. \n"
    return re.sub(r'([/#\n])', r' \1 ', t)

def rm_useless_spaces(t:str) -> str:
    "Remove multiple spaces in `t`."
    return re.sub(' {2,}', ' ', t)
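# Illustrative usage sketch: the two simplest pre-rules give '/' and '#' their own
# tokens, then the extra whitespace that creates is collapsed again.
#
#   spec_add_spaces('fast.ai/#nlp')       # -> 'fast.ai /  # nlp'
#   rm_useless_spaces('fast.ai /  # nlp') # -> 'fast.ai / # nlp'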
def replace_rep(t:str) -> str:
    "Replace repetitions at the character level in `t`."
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
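# Illustrative usage sketch: a run of four or more identical non-space characters is
# rewritten as TK_REP, the run length, and a single copy of the character, so the
# repetition survives without exploding the vocabulary.
#
#   replace_rep('I loooooove it')  # -> 'I l xxrep 6 o ve it'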
def replace_wrep(t:str) -> str:
    "Replace word repetitions in `t`."
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)
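# Illustrative usage sketch: the same idea at the word level -- four or more copies of
# a word become TK_WREP, a count, and one copy of the word. The stray double spaces it
# leaves behind are cleaned up by rm_useless_spaces, which runs later in
# defaults.text_pre_rules.
#
#   replace_wrep('this is so so so so good')  # -> 'this is  xxwrep 4 so  good'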
def fix_html(x:str) -> str:
    "List of replacements from html strings in `x`."
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace(' @,@ ',',').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
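# Illustrative usage sketch: fix_html undoes the HTML entities and markup fragments
# commonly left in scraped corpora before anything is tokenized.
#
#   fix_html('it was <br />great nbsp;value for #36;5')
#   # -> 'it was \ngreat value for $5'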
def replace_all_caps(x:Collection[str]) -> Collection[str]:
    "Replace tokens in ALL CAPS in `x` by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x:Collection[str]) -> Collection[str]:
    "Replace all Capitalized tokens in `x` by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res
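# Illustrative usage sketch: the post-rules operate on token lists rather than strings.
# Casing is stripped but recorded with marker tokens, keeping the vocabulary small
# while preserving the signal.
#
#   replace_all_caps(['I', 'LOVED', 'it'])    # -> ['I', 'xxup', 'loved', 'it']
#   deal_caps(['This', 'movie', 'was', 'OK']) # -> ['xxmaj', 'this', 'movie', 'was', 'ok']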
defaults.text_pre_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces]
defaults.text_post_rules = [replace_all_caps, deal_caps]
class Tokenizer():
    "Put together rules and a tokenizer function to tokenize text with multiprocessing."
    def __init__(self, tok_func:Callable=SpacyTokenizer, lang:str='en', pre_rules:ListRules=None,
                 post_rules:ListRules=None, special_cases:Collection[str]=None, n_cpus:int=None):
        self.tok_func,self.lang,self.special_cases = tok_func,lang,special_cases
        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules)
        self.post_rules = ifnone(post_rules, defaults.text_post_rules)
        self.special_cases = special_cases if special_cases else defaults.text_spec_tok
        self.n_cpus = ifnone(n_cpus, defaults.cpus)

    def __repr__(self) -> str:
        res = f'Tokenizer {self.tok_func.__name__} in {self.lang} with the following rules:\n'
        for rule in self.pre_rules: res += f' - {rule.__name__}\n'
        for rule in self.post_rules: res += f' - {rule.__name__}\n'
        return res

    def process_text(self, t:str, tok:BaseTokenizer) -> List[str]:
        "Process one text `t` with tokenizer `tok`."
        for rule in self.pre_rules: t = rule(t)
        toks = tok.tokenizer(t)
        for rule in self.post_rules: toks = rule(toks)
        return toks

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts` in one process."
        tok = self.tok_func(self.lang)
        if self.special_cases: tok.add_special_cases(self.special_cases)
        return [self.process_text(str(t), tok) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`."
        if self.n_cpus <= 1: return self._process_all_1(texts)
        with ProcessPoolExecutor(self.n_cpus) as e:
            return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])
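# Illustrative usage sketch (assumes spaCy's English data is available): Tokenizer is
# the piece user code normally touches. It applies the pre-rules to each raw string,
# tokenizes with tok_func (SpacyTokenizer by default), applies the post-rules, and
# spreads the work over n_cpus processes via ProcessPoolExecutor.
#
#   tokenizer = Tokenizer(lang='en', n_cpus=1)
#   tokenizer.process_all(['I LOVED this movie!!!!'])
#   # -> roughly [['i', 'xxup', 'loved', 'this', 'movie', 'xxrep', '4', '!']]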
class Vocab():
    "Contain the correspondence between numbers and tokens and numericalize."
    def __init__(self, itos:Collection[str]):
        self.itos = itos
        self.stoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.itos)})

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return [self.stoi[w] for w in t]

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        return sep.join([self.itos[i] for i in nums]) if sep is not None else [self.itos[i] for i in nums]

    def __getstate__(self):
        return {'itos':self.itos}

    def __setstate__(self, state:dict):
        self.itos = state['itos']
        self.stoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.itos)})

    def save(self, path):
        "Save `self.itos` in `path`"
        pickle.dump(self.itos, open(path, 'wb'))

    @classmethod
    def create(cls, tokens:Tokens, max_vocab:int, min_freq:int) -> 'Vocab':
        "Create a vocabulary from a set of `tokens`."
        freq = Counter(p for o in tokens for p in o)
        itos = [o for o,c in freq.most_common(max_vocab) if c >= min_freq]
        for o in reversed(defaults.text_spec_tok):
            if o in itos: itos.remove(o)
            itos.insert(0, o)
        itos = itos[:max_vocab]
        if len(itos) < max_vocab: # Make sure vocab size is a multiple of 8 for fast mixed precision training
            while len(itos)%8 != 0: itos.append('xxfake')
        return cls(itos)

    @classmethod
    def load(cls, path):
        "Load the `Vocab` contained in `path`"
        itos = pickle.load(open(path, 'rb'))
        return cls(itos)
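# Illustrative usage sketch: Vocab.create orders tokens by frequency, always puts the
# special tokens first, and pads with 'xxfake' so the vocabulary size is a multiple of
# 8 (friendlier for mixed-precision training). Because stoi is a defaultdict(int),
# out-of-vocabulary tokens map to index 0, i.e. UNK ('xxunk').
#
#   toks = [['xxbos', 'i', 'loved', 'it'], ['xxbos', 'i', 'hated', 'it']]
#   vocab = Vocab.create(toks, max_vocab=60000, min_freq=1)
#   ids = vocab.numericalize(['i', 'loved', 'pizza'])  # 'pizza' falls back to 0
#   vocab.textify(ids)  # -> 'i loved xxunk'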