CoCalc -- data

GitHub Repository: jantic/deoldify
Path: blob/master/fastai/data_block.py
⁸⁴⁰ views
1
from .torch_core import *
2
from .basic_data import *
3
from .layers import *
4
from numbers import Integral
5

6
# Names exported by a star-import of this module.
__all__ = [
    'ItemList', 'CategoryList', 'MultiCategoryList', 'MultiCategoryProcessor',
    'LabelList', 'ItemLists', 'get_files', 'PreProcessor', 'LabelLists',
    'FloatList', 'CategoryProcessor', 'EmptyLabelList', 'MixedItem',
    'MixedProcessor', 'MixedItemList',
]
9

10
def _decode(df):
11
    return np.array([[df.columns[i] for i,t in enumerate(x) if t==1] for x in df.values], dtype=np.object)
12

13
def _maybe_squeeze(arr):
    "Return `arr` untouched when `is1d` accepts it, otherwise the result of `np.squeeze(arr)`."
    if is1d(arr): return arr
    return np.squeeze(arr)
14

15
def _path_to_same_str(p_fn):
16
    "path -> str, but same on nt+posix, for alpha-sort only"
17
    s_fn = str(p_fn)
18
    s_fn = s_fn.replace('\\','.')
19
    s_fn = s_fn.replace('/','.')
20
    return s_fn
21

22
def _get_files(parent, p, f, extensions):
23
    p = Path(p)#.relative_to(parent)
24
    if isinstance(extensions,str): extensions = [extensions]
25
    low_extensions = [e.lower() for e in extensions] if extensions is not None else None
26
    res = [p/o for o in f if not o.startswith('.')
27
           and (extensions is None or f'.{o.split(".")[-1].lower()}' in low_extensions)]
28
    return res
29

30
def get_files(path:PathOrStr, extensions:Collection[str]=None, recurse:bool=False,
              include:Optional[Collection[str]]=None, presort:bool=False)->FilePathList:
    "Return list of files in `path` that have a suffix in `extensions`; optionally `recurse`."
    if not recurse:
        names = [entry.name for entry in os.scandir(path) if entry.is_file()]
        found = _get_files(path, path, names, extensions)
    else:
        found = []
        for step,(root,dirs,names) in enumerate(os.walk(path)):
            # `dirs` is edited in place so that `os.walk` does not descend into the removed folders:
            # on the first step keep only the `include` folders (when given), otherwise skip hidden dirs.
            if include is not None and step==0: dirs[:] = [o for o in dirs if o in include]
            else:                               dirs[:] = [o for o in dirs if not o.startswith('.')]
            found += _get_files(path, root, names, extensions)
    if presort: found = sorted(found, key=_path_to_same_str)
    return found
47

48
class PreProcessor():
    "Basic class for a processor that will be applied to items at the end of the data block API."
    def __init__(self, ds:Collection=None):
        # Keep a handle on the dataset the processor was created with.
        self.ref_ds = ds

    def process_one(self, item:Any):
        "Process a single `item`; this base class hands it back unchanged."
        return item

    def process(self, ds:Collection):
        "Replace `ds.items` by the array of every item passed through `process_one`."
        processed = [self.process_one(item) for item in ds.items]
        ds.items = array(processed)
53

54
# Annotation alias: one `PreProcessor` or a collection of them, registered in `fastai_types` under its own name.
PreProcessors = Union[PreProcessor, Collection[PreProcessor]]
fastai_types[PreProcessors] = 'PreProcessors'
56

57
class ItemList():
    "A collection of items with `__len__` and `__getitem__` with `ndarray` indexing semantics."
    _bunch,_processor,_label_cls,_square_show,_square_show_res = DataBunch,None,None,False,False

    def __init__(self, items:Iterator, path:PathOrStr='.', label_cls:Callable=None, inner_df:Any=None,
                 processor:PreProcessors=None, x:'ItemList'=None, ignore_empty:bool=False):
        "Store `items` as an array together with `path`, the optional `inner_df`, `processor` and label class."
        self.path = Path(path)
        self.num_parts = len(self.path.parts)
        self.items,self.x,self.ignore_empty = items,x,ignore_empty
        if not isinstance(self.items,np.ndarray): self.items = array(self.items, dtype=object)
        self.label_cls,self.inner_df,self.processor = ifnone(label_cls,self._label_cls),inner_df,processor
        self._label_list,self._split = LabelList,ItemLists
        # Names of the attributes that `new` forwards to the constructor of the copy.
        self.copy_new = ['x', 'label_cls', 'path']

    def __len__(self)->int:
        "Number of items; note that an empty list reports 1, not 0."
        return len(self.items) or 1

    def get(self, i)->Any:
        "Subclass if you want to customize how to create item `i` from `self.items`."
        return self.items[i]

    def __repr__(self)->str:
        "Class name, item count, a preview of at most five items and the path."
        items = [self[i] for i in range(min(5,len(self.items)))]
        return f'{self.__class__.__name__} ({len(self.items)} items)\n{show_some(items)}\nPath: {self.path}'

    def process(self, processor:PreProcessors=None):
        "Apply `processor` or `self.processor` to `self`."
        if processor is not None: self.processor = processor
        self.processor = listify(self.processor)
        for p in self.processor: p.process(self)
        return self

    def process_one(self, item:ItemBase, processor:PreProcessors=None):
        "Apply `processor` or `self.processor` to `item`."
        if processor is not None: self.processor = processor
        self.processor = listify(self.processor)
        for p in self.processor: item = p.process_one(item)
        return item

    def analyze_pred(self, pred:Tensor):
        "Called on `pred` before `reconstruct` for additional preprocessing."
        return pred

    def reconstruct(self, t:Tensor, x:Tensor=None):
        "Reconstruct one of the underlying item for its data `t`."
        return self[0].reconstruct(t,x) if has_arg(self[0].reconstruct, 'x') else self[0].reconstruct(t)

    def new(self, items:Iterator, processor:PreProcessors=None, **kwargs)->'ItemList':
        "Create a new `ItemList` from `items`, keeping the same attributes."
        processor = ifnone(processor, self.processor)
        copy_d = {o:getattr(self,o) for o in self.copy_new}
        # Explicit `kwargs` win over the copied attributes.
        kwargs = {**copy_d, **kwargs}
        return self.__class__(items=items, processor=processor, **kwargs)

    def add(self, items:'ItemList'):
        "Append the items of `items` to `self` in place, concatenating the inner dataframes when both have one."
        self.items = np.concatenate([self.items, items.items], 0)
        if self.inner_df is not None and items.inner_df is not None:
            self.inner_df = pd.concat([self.inner_df, items.inner_df])
        # Compare with `None` explicitly: `df or other` raises, a DataFrame has no single truth value.
        elif self.inner_df is None: self.inner_df = items.inner_df
        return self

    def __getitem__(self,idxs:int)->Any:
        "returns a single item based if `idxs` is an integer or a new `ItemList` object if `idxs` is a range."
        idxs = try_int(idxs)
        if isinstance(idxs, Integral): return self.get(idxs)
        else: return self.new(self.items[idxs], inner_df=index_row(self.inner_df, idxs))

    @classmethod
    def from_folder(cls, path:PathOrStr, extensions:Collection[str]=None, recurse:bool=True,
                    include:Optional[Collection[str]]=None, processor:PreProcessors=None, presort:Optional[bool]=False, **kwargs)->'ItemList':
        """Create an `ItemList` in `path` from the filenames that have a suffix in `extensions`.
        `recurse` determines if we search subfolders."""
        path = Path(path)
        return cls(get_files(path, extensions, recurse=recurse, include=include, presort=presort), path=path, processor=processor, **kwargs)

    @classmethod
    def from_df(cls, df:DataFrame, path:PathOrStr='.', cols:IntsOrStrs=0, processor:PreProcessors=None, **kwargs)->'ItemList':
        "Create an `ItemList` in `path` from the inputs in the `cols` of `df`."
        inputs = df.iloc[:,df_names_to_idx(cols, df)]
        assert not inputs.isna().any().any(), f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
        res = cls(items=_maybe_squeeze(inputs.values), path=path, inner_df=df, processor=processor, **kwargs)
        return res

    @classmethod
    def from_csv(cls, path:PathOrStr, csv_name:str, cols:IntsOrStrs=0, delimiter:str=None, header:str='infer',
                 processor:PreProcessors=None, **kwargs)->'ItemList':
        """Create an `ItemList` in `path` from the inputs in the `cols` of `path/csv_name`"""
        df = pd.read_csv(Path(path)/csv_name, delimiter=delimiter, header=header)
        return cls.from_df(df, path=path, cols=cols, processor=processor, **kwargs)

    def _relative_item_path(self, i):
        "Item `i` expressed relative to `self.path`."
        return self.items[i].relative_to(self.path)

    def _relative_item_paths(self):
        "Every item expressed relative to `self.path`."
        return [self._relative_item_path(i) for i in range_of(self.items)]

    def use_partial_data(self, sample_pct:float=0.01, seed:int=None)->'ItemList':
        "Use only a sample of `sample_pct`of the full dataset and an optional `seed`."
        if seed is not None: np.random.seed(seed)
        rand_idx = np.random.permutation(range_of(self))
        cut = int(sample_pct * len(self))
        return self[rand_idx[:cut]]

    def to_text(self, fn:str):
        "Save `self.items` to `fn` in `self.path`."
        with open(self.path/fn, 'w') as f: f.writelines([f'{o}\n' for o in self._relative_item_paths()])

    def filter_by_func(self, func:Callable)->'ItemList':
        "Only keep elements for which `func` returns `True`."
        self.items = array([o for o in self.items if func(o)])
        return self

    def filter_by_folder(self, include=None, exclude=None):
        "Only keep filenames in `include` folder or reject the ones in `exclude`."
        include,exclude = listify(include),listify(exclude)
        def _inner(o):
            # `n` is the first path component found below `self.path`.
            if isinstance(o, Path): n = o.relative_to(self.path).parts[0]
            else: n = o.split(os.path.sep)[len(str(self.path).split(os.path.sep))]
            if include and n not in include: return False
            if exclude and n in exclude: return False
            return True
        return self.filter_by_func(_inner)

    def filter_by_rand(self, p:float, seed:int=None):
        "Keep random sample of `items` with probability `p` and an optional `seed`."
        if seed is not None: set_all_seed(seed)
        return self.filter_by_func(lambda o: rand_bool(p))

    def no_split(self):
        "Deprecated alias of `split_none`; emits a warning."
        warn("`no_split` is deprecated, please use `split_none`.")
        return self.split_none()

    def split_none(self):
        "Don't split the data and create an empty validation set."
        val = self[[]]
        # The empty validation set is intended here, so silence the emptiness warning.
        val.ignore_empty = True
        return self._split(self.path, self, val)

    def split_by_list(self, train, valid):
        "Split the data between `train` and `valid`."
        return self._split(self.path, train, valid)

    def split_by_idxs(self, train_idx, valid_idx):
        "Split the data between `train_idx` and `valid_idx`."
        return self.split_by_list(self[train_idx], self[valid_idx])

    def split_by_idx(self, valid_idx:Collection[int])->'ItemLists':
        "Split the data according to the indexes in `valid_idx`."
        # Every index that is not in `valid_idx` goes to the training set.
        train_idx = np.setdiff1d(arange_of(self.items), valid_idx)
        return self.split_by_idxs(train_idx, valid_idx)

    def _get_by_folder(self, name):
        "Indexes of the items whose folder component equals `name`."
        return [i for i in range_of(self) if (self.items[i].parts[self.num_parts] if isinstance(self.items[i], Path)
                else self.items[i].split(os.path.sep)[0]) == name ]

    def split_by_folder(self, train:str='train', valid:str='valid')->'ItemLists':
        "Split the data depending on the folder (`train` or `valid`) in which the filenames are."
        return self.split_by_idxs(self._get_by_folder(train), self._get_by_folder(valid))

    def random_split_by_pct(self, valid_pct:float=0.2, seed:int=None):
        "Deprecated alias of `split_by_rand_pct`; emits a warning."
        warn("`random_split_by_pct` is deprecated, please use `split_by_rand_pct`.")
        return self.split_by_rand_pct(valid_pct=valid_pct, seed=seed)

    def split_by_rand_pct(self, valid_pct:float=0.2, seed:int=None)->'ItemLists':
        "Split the items randomly by putting `valid_pct` in the validation set, optional `seed` can be passed."
        if valid_pct==0.: return self.split_none()
        if seed is not None: np.random.seed(seed)
        rand_idx = np.random.permutation(range_of(self))
        cut = int(valid_pct * len(self))
        return self.split_by_idx(rand_idx[:cut])

    def split_subsets(self, train_size:float, valid_size:float, seed=None) -> 'ItemLists':
        "Split the items into train set with size `train_size * n` and valid set with size `valid_size * n`."
        assert 0 < train_size < 1
        assert 0 < valid_size < 1
        assert train_size + valid_size <= 1.
        if seed is not None: np.random.seed(seed)
        n = len(self.items)
        rand_idx = np.random.permutation(range(n))
        train_cut, valid_cut = int(train_size * n), int(valid_size * n)
        # Slice from `n-valid_cut` rather than `-valid_cut`: when `valid_cut` rounds down to 0,
        # `rand_idx[-0:]` would be the whole permutation instead of an empty validation set.
        return self.split_by_idxs(rand_idx[:train_cut], rand_idx[n-valid_cut:])

    def split_by_valid_func(self, func:Callable)->'ItemLists':
        "Split the data by result of `func` (which returns `True` for validation set)."
        valid_idx = [i for i,o in enumerate(self.items) if func(o)]
        return self.split_by_idx(valid_idx)

    def split_by_files(self, valid_names:'ItemList')->'ItemLists':
        "Split the data by using the names in `valid_names` for validation."
        if isinstance(self.items[0], Path): return self.split_by_valid_func(lambda o: o.name in valid_names)
        else: return self.split_by_valid_func(lambda o: os.path.basename(o) in valid_names)

    def split_by_fname_file(self, fname:PathOrStr, path:PathOrStr=None)->'ItemLists':
        "Split the data by using the names in `fname` for the validation set. `path` will override `self.path`."
        path = Path(ifnone(path, self.path))
        valid_names = loadtxt_str(path/fname)
        return self.split_by_files(valid_names)

    def split_from_df(self, col:IntsOrStrs=2):
        "Split the data from the `col` in the dataframe in `self.inner_df`."
        valid_idx = np.where(self.inner_df.iloc[:,df_names_to_idx(col, self.inner_df)])[0]
        return self.split_by_idx(valid_idx)

    def get_label_cls(self, labels, label_cls:Callable=None, label_delim:str=None, **kwargs):
        "Return `label_cls` or guess one from the first element of `labels`."
        if label_cls is not None:               return label_cls
        if self.label_cls is not None:          return self.label_cls
        if label_delim is not None:             return MultiCategoryList
        it = index_row(labels,0)
        if isinstance(it, (float, np.float32)): return FloatList
        if isinstance(try_int(it), (str, Integral)):  return CategoryList
        if isinstance(it, Collection):          return MultiCategoryList
        return ItemList #self.__class__

    def _label_from_list(self, labels:Iterator, label_cls:Callable=None, from_item_lists:bool=False, **kwargs)->'LabelList':
        "Label `self.items` with `labels`."
        if not from_item_lists:
            raise Exception("Your data isn't split, if you don't want a validation set, please use `split_none`.")
        labels = array(labels, dtype=object)
        label_cls = self.get_label_cls(labels, label_cls=label_cls, **kwargs)
        y = label_cls(labels, path=self.path, **kwargs)
        res = self._label_list(x=self, y=y)
        return res

    def label_from_df(self, cols:IntsOrStrs=1, label_cls:Callable=None, **kwargs):
        "Label `self.items` from the values in `cols` in `self.inner_df`."
        labels = self.inner_df.iloc[:,df_names_to_idx(cols, self.inner_df)]
        assert labels.isna().sum().sum() == 0, f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
        # Several label columns are read as a one-hot encoding whose class names are the column names.
        if is_listy(cols) and len(cols) > 1 and (label_cls is None or label_cls == MultiCategoryList):
            new_kwargs,label_cls = dict(one_hot=True, classes= cols),MultiCategoryList
            kwargs = {**new_kwargs, **kwargs}
        return self._label_from_list(_maybe_squeeze(labels), label_cls=label_cls, **kwargs)

    def label_const(self, const:Any=0, label_cls:Callable=None, **kwargs)->'LabelList':
        "Label every item with `const`."
        return self.label_from_func(func=lambda o: const, label_cls=label_cls, **kwargs)

    def label_empty(self, **kwargs):
        "Label every item with an `EmptyLabel`."
        kwargs['label_cls'] = EmptyLabelList
        return self.label_from_func(func=lambda o: 0., **kwargs)

    def label_from_func(self, func:Callable, label_cls:Callable=None, **kwargs)->'LabelList':
        "Apply `func` to every input to get its label."
        return self._label_from_list([func(o) for o in self.items], label_cls=label_cls, **kwargs)

    def label_from_folder(self, label_cls:Callable=None, **kwargs)->'LabelList':
        "Give a label to each filename depending on its folder."
        return self.label_from_func(func=lambda o: (o.parts if isinstance(o, Path) else o.split(os.path.sep))[-2],
                                    label_cls=label_cls, **kwargs)

    def label_from_re(self, pat:str, full_path:bool=False, label_cls:Callable=None, **kwargs)->'LabelList':
        "Apply the re in `pat` to determine the label of every filename.  If `full_path`, search in the full name."
        pat = re.compile(pat)
        def _inner(o):
            # Go through `Path` so `as_posix` exists both for plain-string items and for the
            # `full_path` case (`os.path.join` returns a `str`, which has no `as_posix`).
            target = Path(self.path)/o if full_path else Path(o)
            s = str(target.as_posix())
            res = pat.search(s)
            assert res,f'Failed to find "{pat}" in "{s}"'
            return res.group(1)
        return self.label_from_func(_inner, label_cls=label_cls, **kwargs)

    def databunch(self, **kwargs):
        "To throw a clear error message when the data wasn't split and labeled."
        raise Exception("Your data is neither split nor labeled, can't turn it into a `DataBunch` yet.")
316

317
class EmptyLabelList(ItemList):
    "Basic `ItemList` for dummy labels."
    def get(self, i):
        "Every index maps to a fresh `EmptyLabel`."
        return EmptyLabel()

    def reconstruct(self, t:Tensor, x:Tensor=None):
        "Give back an `EmptyLabel` when `t` has no dimension, otherwise let the inputs in `self.x` rebuild the item."
        if not len(t.size()): return EmptyLabel()
        rebuild = self.x.reconstruct
        if has_arg(rebuild, 'x'): return rebuild(t,x)
        return rebuild(t)
323

324
class CategoryProcessor(PreProcessor):
    "`PreProcessor` that create `classes` from `ds.items` and handle the mapping."
    def __init__(self, ds:ItemList):
        self.create_classes(ds.classes)
        # `state_attrs` lists what gets pickled; `warns` collects the labels that had no known class.
        self.state_attrs,self.warns = ['classes'],[]

    def create_classes(self, classes):
        "Store `classes` and, when they are not `None`, build `c2i`, the mapping from class to index."
        self.classes = classes
        if classes is not None: self.c2i = {v:k for k,v in enumerate(classes)}

    def generate_classes(self, items):
        "Generate classes from `items` by taking the sorted unique values."
        return uniqueify(items, sort=True)

    def process_one(self,item):
        "Map `item` to its class index; an unknown item is recorded in `self.warns` and mapped to `None`."
        if isinstance(item, EmptyLabel): return item
        res = self.c2i.get(item,None)
        if res is None: self.warns.append(str(item))
        return res

    def process(self, ds):
        "Create the classes from `ds.items` if needed, copy `classes` and `c2i` onto `ds`, then process its items."
        if self.classes is None: self.create_classes(self.generate_classes(ds.items))
        ds.classes = self.classes
        ds.c2i = self.c2i
        super().process(ds)

    def __getstate__(self):
        "Pickle only the attributes named in `state_attrs`."
        return {n:getattr(self,n) for n in self.state_attrs}

    def __setstate__(self, state:dict):
        "Rebuild the processor from `state`: `classes` go through `create_classes`, the rest is set as-is."
        self.create_classes(state['classes'])
        # Keep a real list (as `__init__` does) rather than a `dict_keys` view tied to `state`.
        self.state_attrs = list(state.keys())
        # `warns` is not part of the pickled state; without it `process_one` fails on an unknown label.
        self.warns = []
        for n in state.keys():
            if n!='classes': setattr(self, n, state[n])
356

357
class CategoryListBase(ItemList):
    "Basic `ItemList` for classification."
    def __init__(self, items:Iterator, classes:Collection=None, **kwargs):
        self.classes,self.filter_missing_y = classes,True
        super().__init__(items, **kwargs)
        # `classes` has to be handed over to the copies made by `new`.
        self.copy_new += ['classes']

    @property
    def c(self):
        "Number of entries in `self.classes`."
        return len(self.classes)
367

368
class CategoryList(CategoryListBase):
    "Basic `ItemList` for single classification labels."
    _processor=CategoryProcessor
    def __init__(self, items:Iterator, classes:Collection=None, label_delim:str=None, **kwargs):
        # `label_delim` is accepted but not used by this class.
        super().__init__(items, classes=classes, **kwargs)
        self.loss_func = CrossEntropyFlat()

    def get(self, i):
        "Wrap the class index stored at `i` in a `Category`; a `None` entry comes back as `None`."
        idx = self.items[i]
        return None if idx is None else Category(idx, self.classes[idx])

    def analyze_pred(self, pred, thresh:float=0.5):
        "Return `pred.argmax()`; `thresh` is not used."
        return pred.argmax()

    def reconstruct(self, t):
        "Build the `Category` for the class index `t`."
        return Category(t, self.classes[t])
384

385
class MultiCategoryProcessor(CategoryProcessor):
    "`PreProcessor` that create `classes` from `ds.items` and handle the mapping."
    def __init__(self, ds:ItemList, one_hot:bool=False):
        super().__init__(ds)
        self.one_hot = one_hot
        # `one_hot` is pickled along with the attributes registered by the parent class.
        self.state_attrs += ['one_hot']

    def process_one(self,item):
        "Map every label in `item` to its index, dropping unknown ones; one-hot items and `EmptyLabel` pass through."
        if self.one_hot or isinstance(item, EmptyLabel): return item
        idxs = []
        for o in item:
            idx = super().process_one(o)
            if idx is not None: idxs.append(idx)
        return idxs

    def generate_classes(self, items):
        "Generate classes from `items` by taking the sorted unique values."
        seen = set()
        for labels in items: seen.update(labels)
        return sorted(seen)
404

405
class MultiCategoryList(CategoryListBase):
    "Basic `ItemList` for multi-classification labels."
    _processor=MultiCategoryProcessor
    def __init__(self, items:Iterator, classes:Collection=None, label_delim:str=None, one_hot:bool=False, **kwargs):
        if label_delim is not None:
            # Split every label string on `label_delim` with the csv reader.
            items = array(csv.reader(items.astype(str), delimiter=label_delim))
        super().__init__(items, classes=classes, **kwargs)
        if one_hot:
            assert classes is not None, "Please provide class names with `classes=...`"
            self.processor = [MultiCategoryProcessor(self, one_hot=True)]
        self.loss_func = BCEWithLogitsFlat()
        self.one_hot = one_hot
        self.copy_new.append('one_hot')

    def get(self, i):
        "Build the `MultiCategory` stored at `i`; a `None` entry comes back as `None`."
        raw = self.items[i]
        if raw is None: return None
        # One-hot rows are decoded by `reconstruct`; otherwise `raw` holds class indexes.
        if self.one_hot: return self.reconstruct(raw.astype(np.float32))
        names = [self.classes[p] for p in raw]
        return MultiCategory(one_hot(raw, self.c), names, raw)

    def analyze_pred(self, pred, thresh:float=0.5):
        "Float mask of the entries of `pred` that are at least `thresh`."
        mask = pred >= thresh
        return mask.float()

    def reconstruct(self, t):
        "Build the `MultiCategory` for the one-hot encoded `t`."
        active = [i for i in range(self.c) if t[i] == 1.]
        names = [self.classes[p] for p in active]
        return MultiCategory(t, names, active)
430

431
class FloatList(ItemList):
    "`ItemList` suitable for storing the floats in items for regression. Will add a `log` if this flag is `True`."
    def __init__(self, items:Iterator, log:bool=False, classes:Collection=None, **kwargs):
        # `classes` is accepted but not used by this class.
        super().__init__(np.array(items, dtype=np.float32), **kwargs)
        self.log = log
        self.copy_new += ['log']
        shape = self.items.shape
        # `c` is the size of the second axis when there is one, 1 otherwise.
        self.c = shape[1] if len(shape) > 1 else 1
        self.loss_func = MSELossFlat()

    def get(self, i):
        "Item `i` as a `FloatItem`, passed through `np.log` first when `self.log` is set."
        value = super().get(i)
        if self.log: value = np.log(value)
        return FloatItem(value)

    def reconstruct(self,t):
        "Wrap the numpy version of `t` in a `FloatItem`."
        return FloatItem(t.numpy())
445

446
class ItemLists():
    "An `ItemList` for each of `train` and `valid` (optional `test`)."
    def __init__(self, path:PathOrStr, train:ItemList, valid:ItemList):
        "Store `path`, `train` and `valid` (no test set yet) and warn about empty sets unless they opted out."
        self.path,self.train,self.valid,self.test = Path(path),train,valid,None
        if not self.train.ignore_empty and len(self.train.items) == 0:
            warn("Your training set is empty. If this is by design, pass `ignore_empty=True` to remove this warning.")
        if not self.valid.ignore_empty and len(self.valid.items) == 0:
            warn("""Your validation set is empty. If this is by design, use `split_none()`
                 or pass `ignore_empty=True` when labelling to remove this warning.""")
        # Already labelled data turns the object into a `LabelLists`.
        if isinstance(self.train, LabelList): self.__class__ = LabelLists

    def __dir__(self)->List[str]:
        "Attributes of the class and the instance plus the labelling methods delegated to `self.train`."
        default_dir = dir(type(self)) + list(self.__dict__.keys())
        # Only names that `__getattr__` can resolve on the item lists are listed here
        # (`label_from_lists` is a regular method and already part of `default_dir`).
        add_ons = ['label_const', 'label_empty', 'label_from_df', 'label_from_folder', 'label_from_func',
                   'label_from_re']
        return default_dir + add_ons

    def __repr__(self)->str:
        return f'{self.__class__.__name__};\n\nTrain: {self.train};\n\nValid: {self.valid};\n\nTest: {self.test}'

    def __getattr__(self, k):
        "Delegate `k` to `self.train`; a callable is wrapped so that it labels both sets and processes them."
        # This method only runs when normal lookup fails. If `train` itself is not set yet (bare
        # instance), reading `self.train` below would re-enter this method forever, so fail fast.
        if 'train' not in self.__dict__: raise AttributeError(k)
        ft = getattr(self.train, k)
        if not isinstance(ft, Callable): return ft
        fv = getattr(self.valid, k)
        assert isinstance(fv, Callable)
        def _inner(*args, **kwargs):
            self.train = ft(*args, from_item_lists=True, **kwargs)
            assert isinstance(self.train, LabelList)
            # The validation labels use the label class that was picked for the training set.
            kwargs['label_cls'] = self.train.y.__class__
            self.valid = fv(*args, from_item_lists=True, **kwargs)
            self.__class__ = LabelLists
            self.process()
            return self
        return _inner

    def __setstate__(self,data:Any): self.__dict__.update(data)

    @property
    def lists(self):
        "`train` and `valid`, followed by `test` when it is set."
        res = [self.train,self.valid]
        if self.test is not None: res.append(self.test)
        return res

    def label_from_lists(self, train_labels:Iterator, valid_labels:Iterator, label_cls:Callable=None, **kwargs)->'LabelList':
        "Use the labels in `train_labels` and `valid_labels` to label the data. `label_cls` will overwrite the default."
        label_cls = self.train.get_label_cls(train_labels, label_cls)
        self.train = self.train._label_list(x=self.train, y=label_cls(train_labels, **kwargs))
        self.valid = self.valid._label_list(x=self.valid, y=self.train.y.new(valid_labels, **kwargs))
        self.__class__ = LabelLists
        self.process()
        return self

    def transform(self, tfms:Optional[Tuple[TfmList,TfmList]]=(None,None), **kwargs):
        "Set `tfms` to be applied to the xs of the train and validation set."
        if not tfms: tfms=(None,None)
        assert is_listy(tfms) and len(tfms) == 2, "Please pass a list of two lists of transforms (train and valid)."
        self.train.transform(tfms[0], **kwargs)
        self.valid.transform(tfms[1], **kwargs)
        # The test set gets the validation transforms.
        if self.test: self.test.transform(tfms[1], **kwargs)
        return self

    def transform_y(self, tfms:Optional[Tuple[TfmList,TfmList]]=(None,None), **kwargs):
        "Set `tfms` to be applied to the ys of the train and validation set."
        if not tfms: tfms=(None,None)
        self.train.transform_y(tfms[0], **kwargs)
        self.valid.transform_y(tfms[1], **kwargs)
        # The test set gets the validation transforms.
        if self.test: self.test.transform_y(tfms[1], **kwargs)
        return self

    def databunch(self, **kwargs):
        "To throw a clear error message when the data wasn't labeled."
        raise Exception("Your data isn't labeled, can't turn it into a `DataBunch` yet!")
518

519
class LabelLists(ItemLists):
    "A `LabelList` for each of `train` and `valid` (optional `test`)."
    def get_processors(self):
        "Read the default class processors if none have been set."
        procs_x,procs_y = listify(self.train.x._processor),listify(self.train.y._processor)
        xp = ifnone(self.train.x.processor, [p(ds=self.train.x) for p in procs_x])
        yp = ifnone(self.train.y.processor, [p(ds=self.train.y) for p in procs_y])
        return xp,yp

    def process(self):
        "Process the inner datasets."
        xp,yp = self.get_processors()
        for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
        #progress_bar clear the outputs so in some case warnings issued during processing disappear.
        for ds in self.lists:
            if getattr(ds, 'warn', False): warn(ds.warn)
        return self

    def filter_by_func(self, func:Callable):
        "Apply `filter_by_func(func)` to every inner dataset."
        for ds in self.lists: ds.filter_by_func(func)
        return self

    def databunch(self, path:PathOrStr=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus,
                  dl_tfms:Optional[Collection[Callable]]=None, device:torch.device=None, collate_fn:Callable=data_collate,
                  no_check:bool=False, **kwargs)->'DataBunch':
        "Create an `DataBunch` from self, `path` will override `self.path`, `kwargs` are passed to `DataBunch.create`."
        path = Path(ifnone(path, self.path))
        data = self.x._bunch.create(self.train, self.valid, test_ds=self.test, path=path, bs=bs, val_bs=val_bs,
                                    num_workers=num_workers, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, no_check=no_check, **kwargs)
        if getattr(self, 'normalize', False):#In case a normalization was serialized
            norm = self.normalize
            data.normalize((norm['mean'], norm['std']), do_x=norm['do_x'], do_y=norm['do_y'])
        data.label_list = self
        return data

    def add_test(self, items:Iterator, label:Any=None, tfms=None, tfm_y=None):
        "Add test set containing `items` with an arbitrary `label`."
        # Without a `label`, the test items get dummy labels from an `EmptyLabelList`.
        if label is None: labels = EmptyLabelList([0] * len(items))
        else: labels = self.valid.y.new([label] * len(items)).process()
        # The inputs are rebuilt from the validation inputs so that they are processed the same way.
        if isinstance(items, MixedItemList): items = self.valid.x.new(items.item_lists, inner_df=items.inner_df).process()
        elif isinstance(items, ItemList): items = self.valid.x.new(items.items, inner_df=items.inner_df).process()
        else: items = self.valid.x.new(items).process()
        self.test = self.valid.new(items, labels, tfms=tfms, tfm_y=tfm_y)
        return self

    def add_test_folder(self, test_folder:str='test', label:Any=None, tfms=None, tfm_y=None):
        "Add test set containing items from `test_folder` and an arbitrary `label`."
        # note: labels will be ignored if available in the test dataset
        items = self.x.__class__.from_folder(self.path/test_folder)
        return self.add_test(items.items, label=label, tfms=tfms, tfm_y=tfm_y)

    @classmethod
    def load_state(cls, path:PathOrStr, state:dict):
        "Create a `LabelLists` with empty sets from the serialized `state`."
        path = Path(path)
        train_ds = LabelList.load_state(path, state)
        valid_ds = LabelList.load_state(path, state)
        return LabelLists(path, train=train_ds, valid=valid_ds)

    @classmethod
    def load_empty(cls, path:PathOrStr, fn:PathOrStr='export.pkl'):
        "Create a `LabelLists` with empty sets from the serialized file in `path/fn`."
        path = Path(path)
        # NOTE(review): `torch.load` unpickles the file, so only load exports that you trust.
        # The `with` block makes sure the file handle is closed once the state has been read.
        with open(path/fn, 'rb') as f: state = torch.load(f)
        return LabelLists.load_state(path, state)
585

586
def _check_kwargs(ds:ItemList, tfms:TfmList, **kwargs):
    "Try `tfms` and `kwargs` on the first item of `ds` and raise a readable error when they can't be applied."
    tfms = listify(tfms)
    no_tfms = tfms is None or len(tfms) == 0
    # Nothing to check without transforms and without arguments, or on an empty dataset.
    if no_tfms and len(kwargs) == 0: return
    if len(ds.items) < 1: return
    x = ds[0]
    try: x.apply_tfms(tfms, **kwargs)
    except Exception as e:
        raise Exception(f"It's not possible to apply those transforms to your dataset:\n {e}")
594

595
class LabelList(Dataset):
    "A list of inputs `x` and labels `y` with optional `tfms`."
    def __init__(self, x:ItemList, y:ItemList, tfms:TfmList=None, tfm_y:bool=False, **kwargs):
        self.x,self.y,self.tfm_y = x,y,tfm_y
        # Give the labels a back-reference to the inputs they belong to.
        self.y.x = x
        # `item` is only set temporarily by `set_item`; None means the regular dataset is in use.
        self.item=None
        self.transform(tfms, **kwargs)

    def __len__(self)->int: return len(self.x) if self.item is None else 1

    @contextmanager
    def set_item(self,item):
        "For inference, will briefly replace the dataset with one that only contains `item`."
        self.item = self.x.process_one(item)
        # Reset in `finally`: an exception raised inside the `with` body must not leave the
        # dataset stuck on the single `item` (which would also keep `len(self)` at 1).
        try: yield None
        finally: self.item = None

    def __repr__(self)->str:
        # Show at most the first five (x, y) pairs.
        items = [self[i] for i in range(min(5,len(self.items)))]
        res = f'{self.__class__.__name__} ({len(self.items)} items)\n'
        res += f'x: {self.x.__class__.__name__}\n{show_some([i[0] for i in items])}\n'
        res += f'y: {self.y.__class__.__name__}\n{show_some([i[1] for i in items])}\n'
        return res + f'Path: {self.path}'

    def predict(self, res):
        "Delegates predict call on `res` to `self.y`."
        return self.y.predict(res)

    @property
    def c(self):
        "The `c` attribute of the labels `self.y`."
        return self.y.c

    def new(self, x, y, tfms=None, tfm_y=None, **kwargs)->'LabelList':
        "Create a `LabelList` of the same class from `x` and `y`, defaulting to the current `tfms` and `tfm_y`."
        tfms,tfm_y = ifnone(tfms, self.tfms),ifnone(tfm_y, self.tfm_y)
        if isinstance(x, ItemList):
            return self.__class__(x, y, tfms=tfms, tfm_y=tfm_y, **self.tfmargs)
        else:
            # Raw items: wrap them with `self.x.new`/`self.y.new` first, then process the result.
            return self.new(self.x.new(x, **kwargs), self.y.new(y, **kwargs), tfms=tfms, tfm_y=tfm_y).process()

    def __getattr__(self,k:str)->Any:
        "Look up missing attributes on `self.x` first, then on `self.y`."
        x = super().__getattribute__('x')
        res = getattr(x, k, None)
        # `classes` and `c` are never taken from `x`; they fall through to the labels.
        if res is not None and k not in ['classes', 'c']: return res
        y = super().__getattribute__('y')
        res = getattr(y, k, None)
        if res is not None: return res
        raise AttributeError(k)

    def __setstate__(self,data:Any): self.__dict__.update(data)

    def __getitem__(self,idxs:Union[int,np.ndarray])->'LabelList':
        "return a single (x, y) if `idxs` is an integer or a new `LabelList` object if `idxs` is a range."
        idxs = try_int(idxs)
        if isinstance(idxs, Integral):
            # While `set_item` is active every index returns that item, paired with a dummy label 0.
            if self.item is None: x,y = self.x[idxs],self.y[idxs]
            else:                 x,y = self.item   ,0
            if self.tfms or self.tfmargs:
                x = x.apply_tfms(self.tfms, is_x=True, **self.tfmargs)
            if hasattr(self, 'tfms_y') and self.tfm_y and self.item is None:
                y = y.apply_tfms(self.tfms_y, is_x=False, **{**self.tfmargs_y, 'do_resolve':False})
            if y is None: y=0
            return x,y
        else: return self.new(self.x[idxs], self.y[idxs])

    def to_df(self)->pd.DataFrame:
        "Create `pd.DataFrame` containing `items` from `self.x` and `self.y`."
        return pd.DataFrame(dict(x=self.x._relative_item_paths(), y=[str(o) for o in self.y]))

    def to_csv(self, dest:str)->None:
        "Save `self.to_df()` to a CSV file in `self.path`/`dest`."
        self.to_df().to_csv(self.path/dest, index=False)

    def get_state(self, **kwargs):
        "Return the minimal state for export."
        state = {'x_cls':self.x.__class__, 'x_proc':self.x.processor,
                 'y_cls':self.y.__class__, 'y_proc':self.y.processor,
                 'tfms':self.tfms, 'tfm_y':self.tfm_y, 'tfmargs':self.tfmargs}
        if hasattr(self, 'tfms_y'):    state['tfms_y']    = self.tfms_y
        if hasattr(self, 'tfmargs_y'): state['tfmargs_y'] = self.tfmargs_y
        # Extra `kwargs` are merged last, so they override the entries above on key clashes.
        return {**state, **kwargs}

    def export(self, fn:PathOrStr, **kwargs):
        "Export the minimal state and save it in `fn` to load an empty version for inference."
        # Build the state before opening `fn`, then write through a context manager so the
        # pickle is flushed and the handle closed before returning.
        state = self.get_state(**kwargs)
        with open(fn, 'wb') as f: pickle.dump(state, f)

    @classmethod
    def load_empty(cls, path:PathOrStr, fn:PathOrStr):
        "Load the state in `fn` to create an empty `LabelList` for inference."
        # NOTE(review): `pickle.load` runs arbitrary code from the file; only load trusted exports.
        with open(Path(path)/fn, 'rb') as f: state = pickle.load(f)
        return cls.load_state(path, state)

    @classmethod
    def load_state(cls, path:PathOrStr, state:dict) -> 'LabelList':
        "Create a `LabelList` from `state`."
        x = state['x_cls']([], path=path, processor=state['x_proc'], ignore_empty=True)
        y = state['y_cls']([], path=path, processor=state['y_proc'], ignore_empty=True)
        res = cls(x, y, tfms=state['tfms'], tfm_y=state['tfm_y'], **state['tfmargs']).process()
        # Optional entries are only restored when present and truthy.
        if state.get('tfms_y', False):    res.tfms_y    = state['tfms_y']
        if state.get('tfmargs_y', False): res.tfmargs_y = state['tfmargs_y']
        if state.get('normalize', False): res.normalize = state['normalize']
        return res

    def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):
        "Launch the processing on `self.x` and `self.y` with `xp` and `yp`."
        # Labels are processed first so items whose label became None can be dropped before `x` is processed.
        self.y.process(yp)
        if getattr(self.y, 'filter_missing_y', False):
            filt = array([o is None for o in self.y.items])
            if filt.sum()>0:
                #Warnings are given later since progress_bar might make them disappear.
                self.warn = f"You are labelling your items with {self.y.__class__.__name__}.\n"
                self.warn += f"Your {name} set contained the following unknown labels, the corresponding items have been discarded.\n"
                for p in self.y.processor:
                    if len(getattr(p, 'warns', [])) > 0:
                        warnings = list(set(p.warns))
                        # Only the first five distinct warnings are listed.
                        self.warn += ', '.join(warnings[:5])
                        if len(warnings) > 5: self.warn += "..."
                    p.warns = []
                self.x,self.y = self.x[~filt],self.y[~filt]
        self.x.process(xp)
        return self

    def filter_by_func(self, func:Callable):
        "Drop every (x, y) pair of raw items for which `func(x, y)` is true and return `self`."
        filt = array([func(x,y) for x,y in zip(self.x.items, self.y.items)])
        self.x,self.y = self.x[~filt],self.y[~filt]
        return self

    def transform(self, tfms:TfmList, tfm_y:bool=None, **kwargs):
        "Set the `tfms` and `tfm_y` value to be applied to the inputs and targets."
        _check_kwargs(self.x, tfms, **kwargs)
        if tfm_y is None: tfm_y = self.tfm_y
        # Transforms that set `use_on_y` to a false value are not applied to the targets.
        tfms_y = None if tfms is None else list(filter(lambda t: getattr(t, 'use_on_y', True), listify(tfms)))
        if tfm_y: _check_kwargs(self.y, tfms_y, **kwargs)
        self.tfms,self.tfmargs  = tfms,kwargs
        self.tfm_y,self.tfms_y,self.tfmargs_y = tfm_y,tfms_y,kwargs
        return self

    def transform_y(self, tfms:TfmList=None, **kwargs):
        "Set `tfms` to be applied to the targets only."
        # Without explicit `tfms`, reuse the input transforms and layer `kwargs` over the input arguments.
        tfms_y = list(filter(lambda t: getattr(t, 'use_on_y', True), listify(self.tfms if tfms is None else tfms)))
        tfmargs_y = {**self.tfmargs, **kwargs} if tfms is None else kwargs
        _check_kwargs(self.y, tfms_y, **tfmargs_y)
        self.tfm_y,self.tfms_y,self.tfmargs_y=True,tfms_y,tfmargs_y
        return self

    def databunch(self, **kwargs):
        "To throw a clear error message when the data wasn't split."
        raise Exception("Your data isn't split, if you don't want a validation set, please use `split_none`")
740

741
@classmethod
def _databunch_load_empty(cls, path, fname:str='export.pkl'):
    "Rebuild an empty `DataBunch` from the state exported in `path/fname`."
    # Restore the empty `LabelLists` first, then let it build the `DataBunch`.
    return LabelLists.load_empty(path, fn=fname).databunch()

# Attach the loader to `DataBunch` as `DataBunch.load_empty`.
DataBunch.load_empty = _databunch_load_empty
748

749
class MixedProcessor(PreProcessor):
    "Group of processor chains: `procs[k]` holds the processors used for the k-th component."
    def __init__(self, procs:Collection[Union[PreProcessor, Collection[PreProcessor]]]):
        self.procs = procs

    def process_one(self, item:Any):
        "Run each component of `item` through its own chain of processors and return the results as a list."
        def _run_chain(chain, elem):
            # Feed the output of each processor into the next one.
            for proc in chain: elem = proc.process_one(elem)
            return elem
        return [_run_chain(chain, elem) for chain,elem in zip(self.procs, item)]

    def process(self, ds:Collection):
        "Apply every processor chain to the matching entry of `ds.item_lists`."
        for chain,item_list in zip(self.procs, ds.item_lists):
            for proc in chain: proc.process(item_list)
763

764
class MixedItem(ItemBase):
    "Item made of several sub-items: `obj` keeps the sub-items, `data` their `.data` in the same order."
    def __init__(self, items):
        self.obj = items
        self.data = [sub.data for sub in items]

    def __repr__(self):
        # Class name on the first line, then one `repr` per sub-item.
        lines = [f'{self.__class__.__name__}']
        lines.extend(repr(sub) for sub in self.obj)
        return '\n'.join(lines)

    def apply_tfms(self, tfms:Collection, **kwargs):
        "Apply `tfms[k]` to the k-th sub-item (same `kwargs` for all), refresh `data` and return `self`."
        transformed = []
        for sub,sub_tfms in zip(self.obj, tfms):
            transformed.append(sub.apply_tfms(sub_tfms, **kwargs))
        self.obj = transformed
        self.data = [sub.data for sub in transformed]
        return self
775

776
class MixedItemList(ItemList):
    "`ItemList` built on several item lists; element `i` is a `MixedItem` of what each list returns for `get(i)`."

    def __init__(self, item_lists, path:PathOrStr=None, label_cls:Callable=None, inner_df:Any=None,
                 x:'ItemList'=None, ignore_empty:bool=False, processor=None):
        self.item_lists = item_lists
        if processor is None:
            # Instantiate the default processors of every list, then prefer a list's own `processor` when set.
            default_procs = []
            for il in item_lists:
                default_procs.append([p(ds=il) for p in listify(il._processor)])
            chosen = [ifnone(il.processor, dp) for il,dp in zip(item_lists, default_procs)]
            processor = MixedProcessor(chosen)
        if len(item_lists) >= 1:
            # The first list provides the index range and, unless given, the path.
            items = range_of(item_lists[0])
            if path is None: path = item_lists[0].path
        else: items = []
        super().__init__(items, processor=processor, path=path,
                         label_cls=label_cls, inner_df=inner_df, x=x, ignore_empty=ignore_empty)

    def new(self, item_lists, processor:PreProcessor=None, **kwargs)->'ItemList':
        "Create a new object of the same class from `item_lists`, carrying over the `copy_new` attributes."
        processor = ifnone(processor, self.processor)
        # Explicit `kwargs` take precedence over the copied attributes.
        merged = {name:getattr(self,name) for name in self.copy_new}
        merged.update(kwargs)
        return self.__class__(item_lists, processor=processor, **merged)

    def get(self, i):
        "Return a `MixedItem` gathering `get(i)` of every underlying item list."
        parts = [il.get(i) for il in self.item_lists]
        return MixedItem(parts)

    def __getitem__(self,idxs:int)->Any:
        "Return one `MixedItem` for an integer `idxs`, otherwise a new list restricted to `idxs`."
        idxs = try_int(idxs)
        if isinstance(idxs, Integral): return self.get(idxs)
        # Subset every underlying list (items and `inner_df` rows) with the same indices.
        subsets = []
        for il in self.item_lists:
            subsets.append(il.new(il.items[idxs], inner_df=index_row(il.inner_df, idxs)))
        return self.new(subsets, inner_df=index_row(self.inner_df, idxs))
805

806
Product

Resources

Company