1
"""
2
High level interface to PyTables for reading and writing pandas data structures
3
to disk
4
"""
5
from __future__ import annotations
6
7
from contextlib import suppress
8
import copy
9
from datetime import (
10
date,
11
tzinfo,
12
)
13
import itertools
14
import os
15
import re
16
from textwrap import dedent
17
from typing import (
18
TYPE_CHECKING,
19
Any,
20
Callable,
21
Hashable,
22
Literal,
23
Sequence,
24
cast,
25
)
26
import warnings
27
28
import numpy as np
29
30
from pandas._config import (
31
config,
32
get_option,
33
)
34
35
from pandas._libs import (
36
lib,
37
writers as libwriters,
38
)
39
from pandas._libs.tslibs import timezones
40
from pandas._typing import (
41
ArrayLike,
42
DtypeArg,
43
Shape,
44
)
45
from pandas.compat._optional import import_optional_dependency
46
from pandas.compat.pickle_compat import patch_pickle
47
from pandas.errors import PerformanceWarning
48
from pandas.util._decorators import cache_readonly
49
from pandas.util._exceptions import find_stack_level
50
51
from pandas.core.dtypes.common import (
52
ensure_object,
53
is_categorical_dtype,
54
is_complex_dtype,
55
is_datetime64_dtype,
56
is_datetime64tz_dtype,
57
is_extension_array_dtype,
58
is_list_like,
59
is_string_dtype,
60
is_timedelta64_dtype,
61
needs_i8_conversion,
62
)
63
from pandas.core.dtypes.missing import array_equivalent
64
65
from pandas import (
66
DataFrame,
67
DatetimeIndex,
68
Index,
69
MultiIndex,
70
PeriodIndex,
71
Series,
72
TimedeltaIndex,
73
concat,
74
isna,
75
)
76
from pandas.core.api import Int64Index
77
from pandas.core.arrays import (
78
Categorical,
79
DatetimeArray,
80
PeriodArray,
81
)
82
import pandas.core.common as com
83
from pandas.core.computation.pytables import (
84
PyTablesExpr,
85
maybe_expression,
86
)
87
from pandas.core.construction import extract_array
88
from pandas.core.indexes.api import ensure_index
89
from pandas.core.internals import (
90
ArrayManager,
91
BlockManager,
92
)
93
94
from pandas.io.common import stringify_path
95
from pandas.io.formats.printing import (
96
adjoin,
97
pprint_thing,
98
)
99
100
if TYPE_CHECKING:
101
from tables import (
102
Col,
103
File,
104
Node,
105
)
106
107
from pandas.core.internals import Block
108
109
110
# versioning attribute
111
_version = "0.15.2"
112
113
# encoding
114
_default_encoding = "UTF-8"
115
116
117
def _ensure_decoded(s):
118
"""if we have bytes, decode them to unicode"""
119
if isinstance(s, np.bytes_):
120
s = s.decode("UTF-8")
121
return s
122
123
124
def _ensure_encoding(encoding):
125
# set the encoding if we need
126
if encoding is None:
127
encoding = _default_encoding
128
129
return encoding
130
131
132
def _ensure_str(name):
133
"""
134
Ensure that an index / column name is a str (python 3); otherwise they
135
may be np.string dtype. Non-string dtypes are passed through unchanged.
136
137
https://github.com/pandas-dev/pandas/issues/13492
138
"""
139
if isinstance(name, str):
140
name = str(name)
141
return name
142
143
144
Term = PyTablesExpr
145
146
147
def _ensure_term(where, scope_level: int):
148
"""
149
Ensure that the where is a Term or a list of Term.
150
151
This makes sure that we are capturing the scope of variables that are
152
passed; create the terms here with a frame_level=2 (we are 2 levels down)
153
"""
154
# only consider list/tuple here as an ndarray is automatically a coordinate
155
# list
156
level = scope_level + 1
157
if isinstance(where, (list, tuple)):
158
where = [
159
Term(term, scope_level=level + 1) if maybe_expression(term) else term
160
for term in where
161
if term is not None
162
]
163
elif maybe_expression(where):
164
where = Term(where, scope_level=level)
165
return where if where is None or len(where) else None
166
167
168
class PossibleDataLossError(Exception):
169
pass
170
171
172
class ClosedFileError(Exception):
173
pass
174
175
176
class IncompatibilityWarning(Warning):
177
pass
178
179
180
incompatibility_doc = """
181
where criteria is being ignored as this version [%s] is too old (or
182
not-defined), read the file in and write it out to a new file to upgrade (with
183
the copy_to method)
184
"""
185
186
187
class AttributeConflictWarning(Warning):
188
pass
189
190
191
attribute_conflict_doc = """
192
the [%s] attribute of the existing index is [%s] which conflicts with the new
193
[%s], resetting the attribute to None
194
"""
195
196
197
class DuplicateWarning(Warning):
198
pass
199
200
201
duplicate_doc = """
202
duplicate entries in table, taking most recently appended
203
"""
204
205
performance_doc = """
206
your performance may suffer as PyTables will pickle object types that it cannot
207
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
208
"""
209
210
# formats
211
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
212
213
# axes map
214
_AXES_MAP = {DataFrame: [0]}
215
216
# register our configuration options
217
dropna_doc = """
218
: boolean
219
drop ALL nan rows when appending to a table
220
"""
221
format_doc = """
222
: format
223
default format writing format, if None, then
224
put will default to 'fixed' and append will default to 'table'
225
"""
226
227
with config.config_prefix("io.hdf"):
228
config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
229
config.register_option(
230
"default_format",
231
None,
232
format_doc,
233
validator=config.is_one_of_factory(["fixed", "table", None]),
234
)
235
236
# oh the troubles to reduce import time
237
_table_mod = None
238
_table_file_open_policy_is_strict = False
239
240
241
def _tables():
242
global _table_mod
243
global _table_file_open_policy_is_strict
244
if _table_mod is None:
245
import tables
246
247
_table_mod = tables
248
249
# set the file open policy
250
# return the file open policy; this changes as of pytables 3.1
251
# depending on the HDF5 version
252
with suppress(AttributeError):
253
_table_file_open_policy_is_strict = (
254
tables.file._FILE_OPEN_POLICY == "strict"
255
)
256
257
return _table_mod
258
259
260
# interface to/from ###
261
262
263
def to_hdf(
264
path_or_buf,
265
key: str,
266
value: DataFrame | Series,
267
mode: str = "a",
268
complevel: int | None = None,
269
complib: str | None = None,
270
append: bool = False,
271
format: str | None = None,
272
index: bool = True,
273
min_itemsize: int | dict[str, int] | None = None,
274
nan_rep=None,
275
dropna: bool | None = None,
276
data_columns: Literal[True] | list[str] | None = None,
277
errors: str = "strict",
278
encoding: str = "UTF-8",
279
) -> None:
280
"""store this object, close it if we opened it"""
281
if append:
282
f = lambda store: store.append(
283
key,
284
value,
285
format=format,
286
index=index,
287
min_itemsize=min_itemsize,
288
nan_rep=nan_rep,
289
dropna=dropna,
290
data_columns=data_columns,
291
errors=errors,
292
encoding=encoding,
293
)
294
else:
295
# NB: dropna is not passed to `put`
296
f = lambda store: store.put(
297
key,
298
value,
299
format=format,
300
index=index,
301
min_itemsize=min_itemsize,
302
nan_rep=nan_rep,
303
data_columns=data_columns,
304
errors=errors,
305
encoding=encoding,
306
dropna=dropna,
307
)
308
309
path_or_buf = stringify_path(path_or_buf)
310
if isinstance(path_or_buf, str):
311
with HDFStore(
312
path_or_buf, mode=mode, complevel=complevel, complib=complib
313
) as store:
314
f(store)
315
else:
316
f(path_or_buf)
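# Illustrative usage sketch (not part of pandas; the file name and frame are
# hypothetical). This helper is normally reached via DataFrame.to_hdf /
# Series.to_hdf; append=True routes through store.append and implies the
# appendable 'table' format:
#
#   df = pd.DataFrame({"a": [1, 2, 3]})
#   df.to_hdf("store.h5", "df", format="table")
#   df.to_hdf("store.h5", "df", append=True)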
317
318
319
def read_hdf(
320
path_or_buf,
321
key=None,
322
mode: str = "r",
323
errors: str = "strict",
324
where=None,
325
start: int | None = None,
326
stop: int | None = None,
327
columns=None,
328
iterator=False,
329
chunksize: int | None = None,
330
**kwargs,
331
):
332
"""
333
Read from the store, close it if we opened it.
334
335
Retrieve pandas object stored in file, optionally based on where
336
criteria.
337
338
.. warning::
339
340
Pandas uses PyTables for reading and writing HDF5 files, which allows
341
serializing object-dtype data with pickle when using the "fixed" format.
342
Loading pickled data received from untrusted sources can be unsafe.
343
344
See: https://docs.python.org/3/library/pickle.html for more.
345
346
Parameters
347
----------
348
path_or_buf : str, path object, pandas.HDFStore
349
Any valid string path is acceptable. Only supports the local file system,
350
remote URLs and file-like objects are not supported.
351
352
If you want to pass in a path object, pandas accepts any
353
``os.PathLike``.
354
355
Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
356
357
key : object, optional
358
The group identifier in the store. Can be omitted if the HDF file
359
contains a single pandas object.
360
mode : {'r', 'r+', 'a'}, default 'r'
361
Mode to use when opening the file. Ignored if path_or_buf is a
362
:class:`pandas.HDFStore`. Default is 'r'.
363
errors : str, default 'strict'
364
Specifies how encoding and decoding errors are to be handled.
365
See the errors argument for :func:`open` for a full list
366
of options.
367
where : list, optional
368
A list of Term (or convertible) objects.
369
start : int, optional
370
Row number to start selection.
371
stop : int, optional
372
Row number to stop selection.
373
columns : list, optional
374
A list of columns names to return.
375
iterator : bool, optional
376
Return an iterator object.
377
chunksize : int, optional
378
Number of rows to include in an iteration when using an iterator.
379
**kwargs
380
Additional keyword arguments passed to HDFStore.
381
382
Returns
383
-------
384
item : object
385
The selected object. Return type depends on the object stored.
386
387
See Also
388
--------
389
DataFrame.to_hdf : Write a HDF file from a DataFrame.
390
HDFStore : Low-level access to HDF files.
391
392
Examples
393
--------
394
>>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
395
>>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
396
>>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
397
"""
398
if mode not in ["r", "r+", "a"]:
399
raise ValueError(
400
f"mode {mode} is not allowed while performing a read. "
401
f"Allowed modes are r, r+ and a."
402
)
403
# grab the scope
404
if where is not None:
405
where = _ensure_term(where, scope_level=1)
406
407
if isinstance(path_or_buf, HDFStore):
408
if not path_or_buf.is_open:
409
raise OSError("The HDFStore must be open for reading.")
410
411
store = path_or_buf
412
auto_close = False
413
else:
414
path_or_buf = stringify_path(path_or_buf)
415
if not isinstance(path_or_buf, str):
416
raise NotImplementedError(
417
"Support for generic buffers has not been implemented."
418
)
419
try:
420
exists = os.path.exists(path_or_buf)
421
422
# if filepath is too long
423
except (TypeError, ValueError):
424
exists = False
425
426
if not exists:
427
raise FileNotFoundError(f"File {path_or_buf} does not exist")
428
429
store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
430
# can't auto open/close if we are using an iterator
431
# so delegate to the iterator
432
auto_close = True
433
434
try:
435
if key is None:
436
groups = store.groups()
437
if len(groups) == 0:
438
raise ValueError(
439
"Dataset(s) incompatible with Pandas data types, "
440
"not table, or no datasets found in HDF5 file."
441
)
442
candidate_only_group = groups[0]
443
444
# For the HDF file to have only one dataset, all other groups
445
# should then be metadata groups for that candidate group. (This
446
# assumes that the groups() method enumerates parent groups
447
# before their children.)
448
for group_to_check in groups[1:]:
449
if not _is_metadata_of(group_to_check, candidate_only_group):
450
raise ValueError(
451
"key must be provided when HDF5 "
452
"file contains multiple datasets."
453
)
454
key = candidate_only_group._v_pathname
455
return store.select(
456
key,
457
where=where,
458
start=start,
459
stop=stop,
460
columns=columns,
461
iterator=iterator,
462
chunksize=chunksize,
463
auto_close=auto_close,
464
)
465
except (ValueError, TypeError, KeyError):
466
if not isinstance(path_or_buf, HDFStore):
467
# if there is an error, close the store if we opened it.
468
with suppress(AttributeError):
469
store.close()
470
471
raise
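# Illustrative usage sketch (not part of pandas; file, key and column names
# are hypothetical). A `where` criterion is only honoured for data written
# in the 'table' format:
#
#   pd.read_hdf("store.h5", "df")
#   pd.read_hdf("store.h5", "df", where="index > 5", columns=["a"])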
472
473
474
def _is_metadata_of(group: Node, parent_group: Node) -> bool:
475
"""Check if a given group is a metadata group for a given parent_group."""
476
if group._v_depth <= parent_group._v_depth:
477
return False
478
479
current = group
480
while current._v_depth > 1:
481
parent = current._v_parent
482
if parent == parent_group and current._v_name == "meta":
483
return True
484
current = current._v_parent
485
return False
486
487
488
class HDFStore:
489
"""
490
Dict-like IO interface for storing pandas objects in PyTables.
491
492
Either Fixed or Table format.
493
494
.. warning::
495
496
Pandas uses PyTables for reading and writing HDF5 files, which allows
497
serializing object-dtype data with pickle when using the "fixed" format.
498
Loading pickled data received from untrusted sources can be unsafe.
499
500
See: https://docs.python.org/3/library/pickle.html for more.
501
502
Parameters
503
----------
504
path : str
505
File path to HDF5 file.
506
mode : {'a', 'w', 'r', 'r+'}, default 'a'
507
508
``'r'``
509
Read-only; no data can be modified.
510
``'w'``
511
Write; a new file is created (an existing file with the same
512
name would be deleted).
513
``'a'``
514
Append; an existing file is opened for reading and writing,
515
and if the file does not exist it is created.
516
``'r+'``
517
It is similar to ``'a'``, but the file must already exist.
518
complevel : int, 0-9, default None
519
Specifies a compression level for data.
520
A value of 0 or None disables compression.
521
complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
522
Specifies the compression library to be used.
523
As of v0.20.2 these additional compressors for Blosc are supported
524
(default if no compressor specified: 'blosc:blosclz'):
525
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
526
'blosc:zlib', 'blosc:zstd'}.
527
Specifying a compression library which is not available raises
528
a ValueError.
529
fletcher32 : bool, default False
530
If applying compression use the fletcher32 checksum.
531
**kwargs
532
These parameters will be passed to the PyTables open_file method.
533
534
Examples
535
--------
536
>>> bar = pd.DataFrame(np.random.randn(10, 4))
537
>>> store = pd.HDFStore('test.h5')
538
>>> store['foo'] = bar # write to HDF5
539
>>> bar = store['foo'] # retrieve
540
>>> store.close()
541
542
**Create or load HDF5 file in-memory**
543
544
When passing the `driver` option to the PyTables open_file method through
545
**kwargs, the HDF5 file is loaded or created in-memory and will only be
546
written when closed:
547
548
>>> bar = pd.DataFrame(np.random.randn(10, 4))
549
>>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
550
>>> store['foo'] = bar
551
>>> store.close() # only now, data is written to disk
552
"""
553
554
_handle: File | None
555
_mode: str
556
_complevel: int
557
_fletcher32: bool
558
559
def __init__(
560
self,
561
path,
562
mode: str = "a",
563
complevel: int | None = None,
564
complib=None,
565
fletcher32: bool = False,
566
**kwargs,
567
):
568
569
if "format" in kwargs:
570
raise ValueError("format is not a defined argument for HDFStore")
571
572
tables = import_optional_dependency("tables")
573
574
if complib is not None and complib not in tables.filters.all_complibs:
575
raise ValueError(
576
f"complib only supports {tables.filters.all_complibs} compression."
577
)
578
579
if complib is None and complevel is not None:
580
complib = tables.filters.default_complib
581
582
self._path = stringify_path(path)
583
if mode is None:
584
mode = "a"
585
self._mode = mode
586
self._handle = None
587
self._complevel = complevel if complevel else 0
588
self._complib = complib
589
self._fletcher32 = fletcher32
590
self._filters = None
591
self.open(mode=mode, **kwargs)
592
593
def __fspath__(self):
594
return self._path
595
596
@property
597
def root(self):
598
"""return the root node"""
599
self._check_if_open()
600
assert self._handle is not None # for mypy
601
return self._handle.root
602
603
@property
604
def filename(self):
605
return self._path
606
607
def __getitem__(self, key: str):
608
return self.get(key)
609
610
def __setitem__(self, key: str, value):
611
self.put(key, value)
612
613
def __delitem__(self, key: str):
614
return self.remove(key)
615
616
def __getattr__(self, name: str):
617
"""allow attribute access to get stores"""
618
try:
619
return self.get(name)
620
except (KeyError, ClosedFileError):
621
pass
622
raise AttributeError(
623
f"'{type(self).__name__}' object has no attribute '{name}'"
624
)
625
626
def __contains__(self, key: str) -> bool:
627
"""
628
check for existence of this key
629
can match the exact pathname or the pathname w/o the leading '/'
630
"""
631
node = self.get_node(key)
632
if node is not None:
633
name = node._v_pathname
634
if name == key or name[1:] == key:
635
return True
636
return False
637
638
def __len__(self) -> int:
639
return len(self.groups())
640
641
def __repr__(self) -> str:
642
pstr = pprint_thing(self._path)
643
return f"{type(self)}\nFile path: {pstr}\n"
644
645
def __enter__(self):
646
return self
647
648
def __exit__(self, exc_type, exc_value, traceback):
649
self.close()
650
651
def keys(self, include: str = "pandas") -> list[str]:
652
"""
653
Return a list of keys corresponding to objects stored in HDFStore.
654
655
Parameters
656
----------
657
658
include : str, default 'pandas'
659
When include equals 'pandas' return pandas objects.
660
When include equals 'native' return native HDF5 Table objects.
661
662
.. versionadded:: 1.1.0
663
664
Returns
665
-------
666
list
667
List of ABSOLUTE path-names (e.g. have the leading '/').
668
669
Raises
670
------
671
raises ValueError if include has an illegal value
672
"""
673
if include == "pandas":
674
return [n._v_pathname for n in self.groups()]
675
676
elif include == "native":
677
assert self._handle is not None # mypy
678
return [
679
n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
680
]
681
raise ValueError(
682
f"`include` should be either 'pandas' or 'native' but is '{include}'"
683
)
684
685
def __iter__(self):
686
return iter(self.keys())
687
688
def items(self):
689
"""
690
iterate on key->group
691
"""
692
for g in self.groups():
693
yield g._v_pathname, g
694
695
iteritems = items
696
697
def open(self, mode: str = "a", **kwargs):
698
"""
699
Open the file in the specified mode
700
701
Parameters
702
----------
703
mode : {'a', 'w', 'r', 'r+'}, default 'a'
704
See HDFStore docstring or tables.open_file for info about modes
705
**kwargs
706
These parameters will be passed to the PyTables open_file method.
707
"""
708
tables = _tables()
709
710
if self._mode != mode:
711
# if we are changing a write mode to read, ok
712
if self._mode in ["a", "w"] and mode in ["r", "r+"]:
713
pass
714
elif mode in ["w"]:
715
# this would truncate, raise here
716
if self.is_open:
717
raise PossibleDataLossError(
718
f"Re-opening the file [{self._path}] with mode [{self._mode}] "
719
"will delete the current file!"
720
)
721
722
self._mode = mode
723
724
# close and reopen the handle
725
if self.is_open:
726
self.close()
727
728
if self._complevel and self._complevel > 0:
729
self._filters = _tables().Filters(
730
self._complevel, self._complib, fletcher32=self._fletcher32
731
)
732
733
if _table_file_open_policy_is_strict and self.is_open:
734
msg = (
735
"Cannot open HDF5 file, which is already opened, "
736
"even in read-only mode."
737
)
738
raise ValueError(msg)
739
740
self._handle = tables.open_file(self._path, self._mode, **kwargs)
741
742
def close(self):
743
"""
744
Close the PyTables file handle
745
"""
746
if self._handle is not None:
747
self._handle.close()
748
self._handle = None
749
750
@property
751
def is_open(self) -> bool:
752
"""
753
return a boolean indicating whether the file is open
754
"""
755
if self._handle is None:
756
return False
757
return bool(self._handle.isopen)
758
759
def flush(self, fsync: bool = False):
760
"""
761
Force all buffered modifications to be written to disk.
762
763
Parameters
764
----------
765
fsync : bool (default False)
766
call ``os.fsync()`` on the file handle to force writing to disk.
767
768
Notes
769
-----
770
Without ``fsync=True``, flushing may not guarantee that the OS writes
771
to disk. With fsync, the operation will block until the OS claims the
772
file has been written; however, other caching layers may still
773
interfere.
774
"""
775
if self._handle is not None:
776
self._handle.flush()
777
if fsync:
778
with suppress(OSError):
779
os.fsync(self._handle.fileno())
780
781
def get(self, key: str):
782
"""
783
Retrieve pandas object stored in file.
784
785
Parameters
786
----------
787
key : str
788
789
Returns
790
-------
791
object
792
Same type as object stored in file.
793
"""
794
with patch_pickle():
795
# GH#31167 Without this patch, pickle doesn't know how to unpickle
796
# old DateOffset objects now that they are cdef classes.
797
group = self.get_node(key)
798
if group is None:
799
raise KeyError(f"No object named {key} in the file")
800
return self._read_group(group)
801
802
def select(
803
self,
804
key: str,
805
where=None,
806
start=None,
807
stop=None,
808
columns=None,
809
iterator=False,
810
chunksize=None,
811
auto_close: bool = False,
812
):
813
"""
814
Retrieve pandas object stored in file, optionally based on where criteria.
815
816
.. warning::
817
818
Pandas uses PyTables for reading and writing HDF5 files, which allows
819
serializing object-dtype data with pickle when using the "fixed" format.
820
Loading pickled data received from untrusted sources can be unsafe.
821
822
See: https://docs.python.org/3/library/pickle.html for more.
823
824
Parameters
825
----------
826
key : str
827
Object being retrieved from file.
828
where : list or None
829
List of Term (or convertible) objects, optional.
830
start : int or None
831
Row number to start selection.
832
stop : int, default None
833
Row number to stop selection.
834
columns : list or None
835
A list of columns that if not None, will limit the return columns.
836
iterator : bool or False
837
Returns an iterator.
838
chunksize : int or None
839
Number of rows to include in iteration, return an iterator.
840
auto_close : bool or False
841
Should automatically close the store when finished.
842
843
Returns
844
-------
845
object
846
Retrieved object from file.
847
"""
848
group = self.get_node(key)
849
if group is None:
850
raise KeyError(f"No object named {key} in the file")
851
852
# create the storer and axes
853
where = _ensure_term(where, scope_level=1)
854
s = self._create_storer(group)
855
s.infer_axes()
856
857
# function to call on iteration
858
def func(_start, _stop, _where):
859
return s.read(start=_start, stop=_stop, where=_where, columns=columns)
860
861
# create the iterator
862
it = TableIterator(
863
self,
864
s,
865
func,
866
where=where,
867
nrows=s.nrows,
868
start=start,
869
stop=stop,
870
iterator=iterator,
871
chunksize=chunksize,
872
auto_close=auto_close,
873
)
874
875
return it.get_result()
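# Illustrative usage sketch (not part of pandas; the key, columns and the
# `process` helper are hypothetical). select accepts Term-like strings and
# can stream results in chunks via TableIterator:
#
#   store.select("df", where="columns=['a', 'b']")
#   for chunk in store.select("df", chunksize=10000):
#       process(chunk)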
876
877
def select_as_coordinates(
878
self,
879
key: str,
880
where=None,
881
start: int | None = None,
882
stop: int | None = None,
883
):
884
"""
885
return the selection as an Index
886
887
.. warning::
888
889
Pandas uses PyTables for reading and writing HDF5 files, which allows
890
serializing object-dtype data with pickle when using the "fixed" format.
891
Loading pickled data received from untrusted sources can be unsafe.
892
893
See: https://docs.python.org/3/library/pickle.html for more.
894
895
896
Parameters
897
----------
898
key : str
899
where : list of Term (or convertible) objects, optional
900
start : integer (defaults to None), row number to start selection
901
stop : integer (defaults to None), row number to stop selection
902
"""
903
where = _ensure_term(where, scope_level=1)
904
tbl = self.get_storer(key)
905
if not isinstance(tbl, Table):
906
raise TypeError("can only read_coordinates with a table")
907
return tbl.read_coordinates(where=where, start=start, stop=stop)
908
909
def select_column(
910
self,
911
key: str,
912
column: str,
913
start: int | None = None,
914
stop: int | None = None,
915
):
916
"""
917
return a single column from the table. This is generally only useful to
918
select an indexable
919
920
.. warning::
921
922
Pandas uses PyTables for reading and writing HDF5 files, which allows
923
serializing object-dtype data with pickle when using the "fixed" format.
924
Loading pickled data received from untrusted sources can be unsafe.
925
926
See: https://docs.python.org/3/library/pickle.html for more.
927
928
Parameters
929
----------
930
key : str
931
column : str
932
The column of interest.
933
start : int or None, default None
934
stop : int or None, default None
935
936
Raises
937
------
938
raises KeyError if the column is not found (or key is not a valid
939
store)
940
raises ValueError if the column can not be extracted individually (it
941
is part of a data block)
942
943
"""
944
tbl = self.get_storer(key)
945
if not isinstance(tbl, Table):
946
raise TypeError("can only read_column with a table")
947
return tbl.read_column(column=column, start=start, stop=stop)
948
949
def select_as_multiple(
950
self,
951
keys,
952
where=None,
953
selector=None,
954
columns=None,
955
start=None,
956
stop=None,
957
iterator=False,
958
chunksize=None,
959
auto_close: bool = False,
960
):
961
"""
962
Retrieve pandas objects from multiple tables.
963
964
.. warning::
965
966
Pandas uses PyTables for reading and writing HDF5 files, which allows
967
serializing object-dtype data with pickle when using the "fixed" format.
968
Loading pickled data received from untrusted sources can be unsafe.
969
970
See: https://docs.python.org/3/library/pickle.html for more.
971
972
Parameters
973
----------
974
keys : a list of the tables
975
selector : the table to apply the where criteria (defaults to keys[0]
976
if not supplied)
977
columns : the columns I want back
978
start : integer (defaults to None), row number to start selection
979
stop : integer (defaults to None), row number to stop selection
980
iterator : bool, return an iterator, default False
981
chunksize : nrows to include in iteration, return an iterator
982
auto_close : bool, default False
983
Should automatically close the store when finished.
984
985
Raises
986
------
987
raises KeyError if keys or selector is not found or keys is empty
988
raises TypeError if keys is not a list or tuple
989
raises ValueError if the tables are not ALL THE SAME DIMENSIONS
990
"""
991
# default to single select
992
where = _ensure_term(where, scope_level=1)
993
if isinstance(keys, (list, tuple)) and len(keys) == 1:
994
keys = keys[0]
995
if isinstance(keys, str):
996
return self.select(
997
key=keys,
998
where=where,
999
columns=columns,
1000
start=start,
1001
stop=stop,
1002
iterator=iterator,
1003
chunksize=chunksize,
1004
auto_close=auto_close,
1005
)
1006
1007
if not isinstance(keys, (list, tuple)):
1008
raise TypeError("keys must be a list/tuple")
1009
1010
if not len(keys):
1011
raise ValueError("keys must have a non-zero length")
1012
1013
if selector is None:
1014
selector = keys[0]
1015
1016
# collect the tables
1017
tbls = [self.get_storer(k) for k in keys]
1018
s = self.get_storer(selector)
1019
1020
# validate rows
1021
nrows = None
1022
for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1023
if t is None:
1024
raise KeyError(f"Invalid table [{k}]")
1025
if not t.is_table:
1026
raise TypeError(
1027
f"object [{t.pathname}] is not a table, and cannot be used in all "
1028
"select as multiple"
1029
)
1030
1031
if nrows is None:
1032
nrows = t.nrows
1033
elif t.nrows != nrows:
1034
raise ValueError("all tables must have exactly the same nrows!")
1035
1036
# The isinstance checks here are redundant with the check above,
1037
# but necessary for mypy; see GH#29757
1038
_tbls = [x for x in tbls if isinstance(x, Table)]
1039
1040
# axis is the concatenation axis
1041
axis = list({t.non_index_axes[0][0] for t in _tbls})[0]
1042
1043
def func(_start, _stop, _where):
1044
1045
# retrieve the objs, _where is always passed as a set of
1046
# coordinates here
1047
objs = [
1048
t.read(where=_where, columns=columns, start=_start, stop=_stop)
1049
for t in tbls
1050
]
1051
1052
# concat and return
1053
return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1054
1055
# create the iterator
1056
it = TableIterator(
1057
self,
1058
s,
1059
func,
1060
where=where,
1061
nrows=nrows,
1062
start=start,
1063
stop=stop,
1064
iterator=iterator,
1065
chunksize=chunksize,
1066
auto_close=auto_close,
1067
)
1068
1069
return it.get_result(coordinates=True)
1070
1071
def put(
1072
self,
1073
key: str,
1074
value: DataFrame | Series,
1075
format=None,
1076
index=True,
1077
append=False,
1078
complib=None,
1079
complevel: int | None = None,
1080
min_itemsize: int | dict[str, int] | None = None,
1081
nan_rep=None,
1082
data_columns: Literal[True] | list[str] | None = None,
1083
encoding=None,
1084
errors: str = "strict",
1085
track_times: bool = True,
1086
dropna: bool = False,
1087
):
1088
"""
1089
Store object in HDFStore.
1090
1091
Parameters
1092
----------
1093
key : str
1094
value : {Series, DataFrame}
1095
format : 'fixed(f)|table(t)', default is 'fixed'
1096
Format to use when storing object in HDFStore. Value can be one of:
1097
1098
``'fixed'``
1099
Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1100
``'table'``
1101
Table format. Write as a PyTables Table structure which may perform
1102
worse but allow more flexible operations like searching / selecting
1103
subsets of the data.
1104
append : bool, default False
1105
This will force Table format, append the input data to the existing.
1106
data_columns : list of columns or True, default None
1107
List of columns to create as data columns, or True to use all columns.
1108
See `here
1109
<https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1110
encoding : str, default None
1111
Provide an encoding for strings.
1112
track_times : bool, default True
1113
Parameter is propagated to 'create_table' method of 'PyTables'.
1114
If set to False it enables to have the same h5 files (same hashes)
1115
independent on creation time.
1116
1117
.. versionadded:: 1.1.0
1118
"""
1119
if format is None:
1120
format = get_option("io.hdf.default_format") or "fixed"
1121
format = self._validate_format(format)
1122
self._write_to_group(
1123
key,
1124
value,
1125
format=format,
1126
index=index,
1127
append=append,
1128
complib=complib,
1129
complevel=complevel,
1130
min_itemsize=min_itemsize,
1131
nan_rep=nan_rep,
1132
data_columns=data_columns,
1133
encoding=encoding,
1134
errors=errors,
1135
track_times=track_times,
1136
dropna=dropna,
1137
)
1138
1139
def remove(self, key: str, where=None, start=None, stop=None):
1140
"""
1141
Remove pandas object partially by specifying the where condition
1142
1143
Parameters
1144
----------
1145
key : str
1146
Node to remove or delete rows from
1147
where : list of Term (or convertible) objects, optional
1148
start : integer (defaults to None), row number to start selection
1149
stop : integer (defaults to None), row number to stop selection
1150
1151
Returns
1152
-------
1153
number of rows removed (or None if not a Table)
1154
1155
Raises
1156
------
1157
raises KeyError if key is not a valid store
1158
1159
"""
1160
where = _ensure_term(where, scope_level=1)
1161
try:
1162
s = self.get_storer(key)
1163
except KeyError:
1164
# the key is not a valid store, re-raising KeyError
1165
raise
1166
except AssertionError:
1167
# surface any assertion errors for e.g. debugging
1168
raise
1169
except Exception as err:
1170
# In tests we get here with ClosedFileError, TypeError, and
1171
# _table_mod.NoSuchNodeError. TODO: Catch only these?
1172
1173
if where is not None:
1174
raise ValueError(
1175
"trying to remove a node with a non-None where clause!"
1176
) from err
1177
1178
# we are actually trying to remove a node (with children)
1179
node = self.get_node(key)
1180
if node is not None:
1181
node._f_remove(recursive=True)
1182
return None
1183
1184
# remove the node
1185
if com.all_none(where, start, stop):
1186
s.group._f_remove(recursive=True)
1187
1188
# delete from the table
1189
else:
1190
if not s.is_table:
1191
raise ValueError(
1192
"can only remove with where on objects written as tables"
1193
)
1194
return s.delete(where=where, start=start, stop=stop)
1195
1196
def append(
1197
self,
1198
key: str,
1199
value: DataFrame | Series,
1200
format=None,
1201
axes=None,
1202
index=True,
1203
append=True,
1204
complib=None,
1205
complevel: int | None = None,
1206
columns=None,
1207
min_itemsize: int | dict[str, int] | None = None,
1208
nan_rep=None,
1209
chunksize=None,
1210
expectedrows=None,
1211
dropna: bool | None = None,
1212
data_columns: Literal[True] | list[str] | None = None,
1213
encoding=None,
1214
errors: str = "strict",
1215
):
1216
"""
1217
Append to Table in file. Node must already exist and be Table
1218
format.
1219
1220
Parameters
1221
----------
1222
key : str
1223
value : {Series, DataFrame}
1224
format : 'table' is the default
1225
Format to use when storing object in HDFStore. Value can be one of:
1226
1227
``'table'``
1228
Table format. Write as a PyTables Table structure which may perform
1229
worse but allow more flexible operations like searching / selecting
1230
subsets of the data.
1231
append : bool, default True
1232
Append the input data to the existing.
1233
data_columns : list of columns, or True, default None
1234
List of columns to create as indexed data columns for on-disk
1235
queries, or True to use all columns. By default only the axes
1236
of the object are indexed. See `here
1237
<https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1238
min_itemsize : dict of columns that specify minimum str sizes
1239
nan_rep : str to use as str nan representation
1240
chunksize : size to chunk the writing
1241
expectedrows : expected TOTAL row size of this table
1242
encoding : default None, provide an encoding for str
1243
dropna : bool, default False
1244
Do not write an ALL nan row to the store, settable
1245
by the option 'io.hdf.dropna_table'.
1246
1247
Notes
1248
-----
1249
Does *not* check if data being appended overlaps with existing
1250
data in the table, so be careful
1251
"""
1252
if columns is not None:
1253
raise TypeError(
1254
"columns is not a supported keyword in append, try data_columns"
1255
)
1256
1257
if dropna is None:
1258
dropna = get_option("io.hdf.dropna_table")
1259
if format is None:
1260
format = get_option("io.hdf.default_format") or "table"
1261
format = self._validate_format(format)
1262
self._write_to_group(
1263
key,
1264
value,
1265
format=format,
1266
axes=axes,
1267
index=index,
1268
append=append,
1269
complib=complib,
1270
complevel=complevel,
1271
min_itemsize=min_itemsize,
1272
nan_rep=nan_rep,
1273
chunksize=chunksize,
1274
expectedrows=expectedrows,
1275
dropna=dropna,
1276
data_columns=data_columns,
1277
encoding=encoding,
1278
errors=errors,
1279
)
1280
1281
def append_to_multiple(
1282
self,
1283
d: dict,
1284
value,
1285
selector,
1286
data_columns=None,
1287
axes=None,
1288
dropna=False,
1289
**kwargs,
1290
):
1291
"""
1292
Append to multiple tables
1293
1294
Parameters
1295
----------
1296
d : a dict of table_name to table_columns, None is acceptable as the
1297
values of one node (this will get all the remaining columns)
1298
value : a pandas object
1299
selector : a string that designates the indexable table; all of its
1300
columns will be designated as data_columns, unless data_columns is
1301
passed, in which case these are used
1302
data_columns : list of columns to create as data columns, or True to
1303
use all columns
1304
dropna : if evaluates to True, drop rows from all tables if any single
1305
row in each table has all NaN. Default False.
1306
1307
Notes
1308
-----
1309
axes parameter is currently not accepted
1310
1311
"""
1312
if axes is not None:
1313
raise TypeError(
1314
"axes is currently not accepted as a parameter to append_to_multiple; "
1315
"you can create the tables independently instead"
1316
)
1317
1318
if not isinstance(d, dict):
1319
raise ValueError(
1320
"append_to_multiple must have a dictionary specified as the "
1321
"way to split the value"
1322
)
1323
1324
if selector not in d:
1325
raise ValueError(
1326
"append_to_multiple requires a selector that is in passed dict"
1327
)
1328
1329
# figure out the splitting axis (the non_index_axis)
1330
axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1331
1332
# figure out how to split the value
1333
remain_key = None
1334
remain_values: list = []
1335
for k, v in d.items():
1336
if v is None:
1337
if remain_key is not None:
1338
raise ValueError(
1339
"append_to_multiple can only have one value in d that is None"
1340
)
1341
remain_key = k
1342
else:
1343
remain_values.extend(v)
1344
if remain_key is not None:
1345
ordered = value.axes[axis]
1346
ordd = ordered.difference(Index(remain_values))
1347
ordd = sorted(ordered.get_indexer(ordd))
1348
d[remain_key] = ordered.take(ordd)
1349
1350
# data_columns
1351
if data_columns is None:
1352
data_columns = d[selector]
1353
1354
# ensure rows are synchronized across the tables
1355
if dropna:
1356
idxs = (value[cols].dropna(how="all").index for cols in d.values())
1357
valid_index = next(idxs)
1358
for index in idxs:
1359
valid_index = valid_index.intersection(index)
1360
value = value.loc[valid_index]
1361
1362
min_itemsize = kwargs.pop("min_itemsize", None)
1363
1364
# append
1365
for k, v in d.items():
1366
dc = data_columns if k == selector else None
1367
1368
# compute the val
1369
val = value.reindex(v, axis=axis)
1370
1371
filtered = (
1372
{key: value for (key, value) in min_itemsize.items() if key in v}
1373
if min_itemsize is not None
1374
else None
1375
)
1376
self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
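# Illustrative usage sketch (not part of pandas; keys and column names are
# hypothetical). The selector table carries the queryable columns, and
# select_as_multiple reassembles the pieces on the shared row index:
#
#   store.append_to_multiple({"left": ["a", "b"], "right": None}, df,
#                            selector="left")
#   store.select_as_multiple(["left", "right"], where="a > 0",
#                            selector="left")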
1377
1378
def create_table_index(
1379
self,
1380
key: str,
1381
columns=None,
1382
optlevel: int | None = None,
1383
kind: str | None = None,
1384
):
1385
"""
1386
Create a pytables index on the table.
1387
1388
Parameters
1389
----------
1390
key : str
1391
columns : None, bool, or listlike[str]
1392
Indicate which columns to create an index on.
1393
1394
* False : Do not create any indexes.
1395
* True : Create indexes on all columns.
1396
* None : Create indexes on all columns.
1397
* listlike : Create indexes on the given columns.
1398
1399
optlevel : int or None, default None
1400
Optimization level, if None, pytables defaults to 6.
1401
kind : str or None, default None
1402
Kind of index, if None, pytables defaults to "medium".
1403
1404
Raises
1405
------
1406
TypeError: raises if the node is not a table
1407
"""
1408
# version requirements
1409
_tables()
1410
s = self.get_storer(key)
1411
if s is None:
1412
return
1413
1414
if not isinstance(s, Table):
1415
raise TypeError("cannot create table index on a Fixed format store")
1416
s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1417
1418
def groups(self):
1419
"""
1420
Return a list of all the top-level nodes.
1421
1422
Each node returned is not a pandas storage object.
1423
1424
Returns
1425
-------
1426
list
1427
List of objects.
1428
"""
1429
_tables()
1430
self._check_if_open()
1431
assert self._handle is not None # for mypy
1432
assert _table_mod is not None # for mypy
1433
return [
1434
g
1435
for g in self._handle.walk_groups()
1436
if (
1437
not isinstance(g, _table_mod.link.Link)
1438
and (
1439
getattr(g._v_attrs, "pandas_type", None)
1440
or getattr(g, "table", None)
1441
or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1442
)
1443
)
1444
]
1445
1446
def walk(self, where="/"):
1447
"""
1448
Walk the pytables group hierarchy for pandas objects.
1449
1450
This generator will yield the group path, subgroups and pandas object
1451
names for each group.
1452
1453
Any non-pandas PyTables objects that are not a group will be ignored.
1454
1455
The `where` group itself is listed first (preorder), then each of its
1456
child groups (following an alphanumerical order) is also traversed,
1457
following the same procedure.
1458
1459
Parameters
1460
----------
1461
where : str, default "/"
1462
Group where to start walking.
1463
1464
Yields
1465
------
1466
path : str
1467
Full path to a group (without trailing '/').
1468
groups : list
1469
Names (strings) of the groups contained in `path`.
1470
leaves : list
1471
Names (strings) of the pandas objects contained in `path`.
1472
"""
1473
_tables()
1474
self._check_if_open()
1475
assert self._handle is not None # for mypy
1476
assert _table_mod is not None # for mypy
1477
1478
for g in self._handle.walk_groups(where):
1479
if getattr(g._v_attrs, "pandas_type", None) is not None:
1480
continue
1481
1482
groups = []
1483
leaves = []
1484
for child in g._v_children.values():
1485
pandas_type = getattr(child._v_attrs, "pandas_type", None)
1486
if pandas_type is None:
1487
if isinstance(child, _table_mod.group.Group):
1488
groups.append(child._v_name)
1489
else:
1490
leaves.append(child._v_name)
1491
1492
yield (g._v_pathname.rstrip("/"), groups, leaves)
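# Illustrative usage sketch (not part of pandas; the group layout is
# hypothetical):
#
#   for path, groups, leaves in store.walk():
#       for leaf in leaves:
#           obj = store.get(f"{path}/{leaf}")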
1493
1494
def get_node(self, key: str) -> Node | None:
1495
"""return the node with the key or None if it does not exist"""
1496
self._check_if_open()
1497
if not key.startswith("/"):
1498
key = "/" + key
1499
1500
assert self._handle is not None
1501
assert _table_mod is not None # for mypy
1502
try:
1503
node = self._handle.get_node(self.root, key)
1504
except _table_mod.exceptions.NoSuchNodeError:
1505
return None
1506
1507
assert isinstance(node, _table_mod.Node), type(node)
1508
return node
1509
1510
def get_storer(self, key: str) -> GenericFixed | Table:
1511
"""return the storer object for a key, raise if not in the file"""
1512
group = self.get_node(key)
1513
if group is None:
1514
raise KeyError(f"No object named {key} in the file")
1515
1516
s = self._create_storer(group)
1517
s.infer_axes()
1518
return s
1519
1520
def copy(
1521
self,
1522
file,
1523
mode="w",
1524
propindexes: bool = True,
1525
keys=None,
1526
complib=None,
1527
complevel: int | None = None,
1528
fletcher32: bool = False,
1529
overwrite=True,
1530
):
1531
"""
1532
Copy the existing store to a new file, updating in place.
1533
1534
Parameters
1535
----------
1536
propindexes : bool, default True
1537
Restore indexes in copied file.
1538
keys : list, optional
1539
List of keys to include in the copy (defaults to all).
1540
overwrite : bool, default True
1541
Whether to overwrite (remove and replace) existing nodes in the new store.
1542
mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1543
1544
Returns
1545
-------
1546
open file handle of the new store
1547
"""
1548
new_store = HDFStore(
1549
file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1550
)
1551
if keys is None:
1552
keys = list(self.keys())
1553
if not isinstance(keys, (tuple, list)):
1554
keys = [keys]
1555
for k in keys:
1556
s = self.get_storer(k)
1557
if s is not None:
1558
1559
if k in new_store:
1560
if overwrite:
1561
new_store.remove(k)
1562
1563
data = self.select(k)
1564
if isinstance(s, Table):
1565
1566
index: bool | list[str] = False
1567
if propindexes:
1568
index = [a.name for a in s.axes if a.is_indexed]
1569
new_store.append(
1570
k,
1571
data,
1572
index=index,
1573
data_columns=getattr(s, "data_columns", None),
1574
encoding=s.encoding,
1575
)
1576
else:
1577
new_store.put(k, data, encoding=s.encoding)
1578
1579
return new_store
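# Illustrative usage sketch (not part of pandas; file names are hypothetical).
# Copying to a new file is also the suggested route for upgrading stores
# written by old pandas versions (see incompatibility_doc above):
#
#   new = store.copy("store_new.h5", propindexes=True)
#   new.close()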
1580
1581
def info(self) -> str:
1582
"""
1583
Print detailed information on the store.
1584
1585
Returns
1586
-------
1587
str
1588
"""
1589
path = pprint_thing(self._path)
1590
output = f"{type(self)}\nFile path: {path}\n"
1591
1592
if self.is_open:
1593
lkeys = sorted(self.keys())
1594
if len(lkeys):
1595
keys = []
1596
values = []
1597
1598
for k in lkeys:
1599
try:
1600
s = self.get_storer(k)
1601
if s is not None:
1602
keys.append(pprint_thing(s.pathname or k))
1603
values.append(pprint_thing(s or "invalid_HDFStore node"))
1604
except AssertionError:
1605
# surface any assertion errors for e.g. debugging
1606
raise
1607
except Exception as detail:
1608
keys.append(k)
1609
dstr = pprint_thing(detail)
1610
values.append(f"[invalid_HDFStore node: {dstr}]")
1611
1612
output += adjoin(12, keys, values)
1613
else:
1614
output += "Empty"
1615
else:
1616
output += "File is CLOSED"
1617
1618
return output
1619
1620
# ------------------------------------------------------------------------
1621
# private methods
1622
1623
def _check_if_open(self):
1624
if not self.is_open:
1625
raise ClosedFileError(f"{self._path} file is not open!")
1626
1627
def _validate_format(self, format: str) -> str:
1628
"""validate / deprecate formats"""
1629
# validate
1630
try:
1631
format = _FORMAT_MAP[format.lower()]
1632
except KeyError as err:
1633
raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1634
1635
return format
1636
1637
def _create_storer(
1638
self,
1639
group,
1640
format=None,
1641
value: DataFrame | Series | None = None,
1642
encoding: str = "UTF-8",
1643
errors: str = "strict",
1644
) -> GenericFixed | Table:
1645
"""return a suitable class to operate"""
1646
cls: type[GenericFixed] | type[Table]
1647
1648
if value is not None and not isinstance(value, (Series, DataFrame)):
1649
raise TypeError("value must be None, Series, or DataFrame")
1650
1651
def error(t):
1652
# return instead of raising so mypy can tell where we are raising
1653
return TypeError(
1654
f"cannot properly create the storer for: [{t}] [group->"
1655
f"{group},value->{type(value)},format->{format}"
1656
)
1657
1658
pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1659
tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1660
1661
# infer the pt from the passed value
1662
if pt is None:
1663
if value is None:
1664
_tables()
1665
assert _table_mod is not None # for mypy
1666
if getattr(group, "table", None) or isinstance(
1667
group, _table_mod.table.Table
1668
):
1669
pt = "frame_table"
1670
tt = "generic_table"
1671
else:
1672
raise TypeError(
1673
"cannot create a storer if the object is not existing "
1674
"nor a value are passed"
1675
)
1676
else:
1677
if isinstance(value, Series):
1678
pt = "series"
1679
else:
1680
pt = "frame"
1681
1682
# we are actually a table
1683
if format == "table":
1684
pt += "_table"
1685
1686
# a storer node
1687
if "table" not in pt:
1688
_STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1689
try:
1690
cls = _STORER_MAP[pt]
1691
except KeyError as err:
1692
raise error("_STORER_MAP") from err
1693
return cls(self, group, encoding=encoding, errors=errors)
1694
1695
# existing node (and must be a table)
1696
if tt is None:
1697
# if we are a writer, determine the tt
1698
if value is not None:
1699
if pt == "series_table":
1700
index = getattr(value, "index", None)
1701
if index is not None:
1702
if index.nlevels == 1:
1703
tt = "appendable_series"
1704
elif index.nlevels > 1:
1705
tt = "appendable_multiseries"
1706
elif pt == "frame_table":
1707
index = getattr(value, "index", None)
1708
if index is not None:
1709
if index.nlevels == 1:
1710
tt = "appendable_frame"
1711
elif index.nlevels > 1:
1712
tt = "appendable_multiframe"
1713
1714
_TABLE_MAP = {
1715
"generic_table": GenericTable,
1716
"appendable_series": AppendableSeriesTable,
1717
"appendable_multiseries": AppendableMultiSeriesTable,
1718
"appendable_frame": AppendableFrameTable,
1719
"appendable_multiframe": AppendableMultiFrameTable,
1720
"worm": WORMTable,
1721
}
1722
try:
1723
cls = _TABLE_MAP[tt]
1724
except KeyError as err:
1725
raise error("_TABLE_MAP") from err
1726
1727
return cls(self, group, encoding=encoding, errors=errors)
1728
1729
def _write_to_group(
1730
self,
1731
key: str,
1732
value: DataFrame | Series,
1733
format,
1734
axes=None,
1735
index=True,
1736
append=False,
1737
complib=None,
1738
complevel: int | None = None,
1739
fletcher32=None,
1740
min_itemsize: int | dict[str, int] | None = None,
1741
chunksize=None,
1742
expectedrows=None,
1743
dropna=False,
1744
nan_rep=None,
1745
data_columns=None,
1746
encoding=None,
1747
errors: str = "strict",
1748
track_times: bool = True,
1749
) -> None:
1750
# we don't want to store a table node at all if our object is 0-len
1751
# as there are not dtypes
1752
if getattr(value, "empty", None) and (format == "table" or append):
1753
return
1754
1755
group = self._identify_group(key, append)
1756
1757
s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1758
if append:
1759
# raise if we are trying to append to a Fixed format,
1760
# or a table that exists (and we are putting)
1761
if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1762
raise ValueError("Can only append to Tables")
1763
if not s.is_exists:
1764
s.set_object_info()
1765
else:
1766
s.set_object_info()
1767
1768
if not s.is_table and complib:
1769
raise ValueError("Compression not supported on Fixed format stores")
1770
1771
# write the object
1772
s.write(
1773
obj=value,
1774
axes=axes,
1775
append=append,
1776
complib=complib,
1777
complevel=complevel,
1778
fletcher32=fletcher32,
1779
min_itemsize=min_itemsize,
1780
chunksize=chunksize,
1781
expectedrows=expectedrows,
1782
dropna=dropna,
1783
nan_rep=nan_rep,
1784
data_columns=data_columns,
1785
track_times=track_times,
1786
)
1787
1788
if isinstance(s, Table) and index:
1789
s.create_index(columns=index)
1790
1791
def _read_group(self, group: Node):
1792
s = self._create_storer(group)
1793
s.infer_axes()
1794
return s.read()
1795
1796
def _identify_group(self, key: str, append: bool) -> Node:
1797
"""Identify HDF5 group based on key, delete/create group if needed."""
1798
group = self.get_node(key)
1799
1800
# we make this assertion for mypy; the get_node call will already
1801
# have raised if this is incorrect
1802
assert self._handle is not None
1803
1804
# remove the node if we are not appending
1805
if group is not None and not append:
1806
self._handle.remove_node(group, recursive=True)
1807
group = None
1808
1809
if group is None:
1810
group = self._create_nodes_and_group(key)
1811
1812
return group
1813
1814
def _create_nodes_and_group(self, key: str) -> Node:
1815
"""Create nodes from key and return group name."""
1816
# assertion for mypy
1817
assert self._handle is not None
1818
1819
paths = key.split("/")
1820
# recursively create the groups
1821
path = "/"
1822
for p in paths:
1823
if not len(p):
1824
continue
1825
new_path = path
1826
if not path.endswith("/"):
1827
new_path += "/"
1828
new_path += p
1829
group = self.get_node(new_path)
1830
if group is None:
1831
group = self._handle.create_group(path, p)
1832
path = new_path
1833
return group
1834
1835
1836
class TableIterator:
1837
"""
1838
Define the iteration interface on a table
1839
1840
Parameters
1841
----------
1842
store : HDFStore
1843
s : the referred storer
1844
func : the function to execute the query
1845
where : the where of the query
1846
nrows : the rows to iterate on
1847
start : the passed start value (default is None)
1848
stop : the passed stop value (default is None)
1849
iterator : bool, default False
1850
Whether to use the default iterator.
1851
chunksize : the passed chunking value (default is 100000)
1852
auto_close : bool, default False
1853
Whether to automatically close the store at the end of iteration.
1854
"""
1855
1856
chunksize: int | None
1857
store: HDFStore
1858
s: GenericFixed | Table
1859
1860
def __init__(
1861
self,
1862
store: HDFStore,
1863
s: GenericFixed | Table,
1864
func,
1865
where,
1866
nrows,
1867
start=None,
1868
stop=None,
1869
iterator: bool = False,
1870
chunksize: int | None = None,
1871
auto_close: bool = False,
1872
):
1873
self.store = store
1874
self.s = s
1875
self.func = func
1876
self.where = where
1877
1878
# set start/stop if they are not set if we are a table
1879
if self.s.is_table:
1880
if nrows is None:
1881
nrows = 0
1882
if start is None:
1883
start = 0
1884
if stop is None:
1885
stop = nrows
1886
stop = min(nrows, stop)
1887
1888
self.nrows = nrows
1889
self.start = start
1890
self.stop = stop
1891
1892
self.coordinates = None
1893
if iterator or chunksize is not None:
1894
if chunksize is None:
1895
chunksize = 100000
1896
self.chunksize = int(chunksize)
1897
else:
1898
self.chunksize = None
1899
1900
self.auto_close = auto_close
1901
1902
def __iter__(self):
1903
# iterate
1904
current = self.start
1905
if self.coordinates is None:
1906
raise ValueError("Cannot iterate until get_result is called.")
1907
while current < self.stop:
1908
stop = min(current + self.chunksize, self.stop)
1909
value = self.func(None, None, self.coordinates[current:stop])
1910
current = stop
1911
if value is None or not len(value):
1912
continue
1913
1914
yield value
1915
1916
self.close()
1917
1918
def close(self):
1919
if self.auto_close:
1920
self.store.close()
1921
1922
def get_result(self, coordinates: bool = False):
1923
# return the actual iterator
1924
if self.chunksize is not None:
1925
if not isinstance(self.s, Table):
1926
raise TypeError("can only use an iterator or chunksize on a table")
1927
1928
self.coordinates = self.s.read_coordinates(where=self.where)
1929
1930
return self
1931
1932
# if specified read via coordinates (necessary for multiple selections
1933
if coordinates:
1934
if not isinstance(self.s, Table):
1935
raise TypeError("can only read_coordinates on a table")
1936
where = self.s.read_coordinates(
1937
where=self.where, start=self.start, stop=self.stop
1938
)
1939
else:
1940
where = self.where
1941
1942
# directly return the result
1943
results = self.func(self.start, self.stop, where)
1944
self.close()
1945
return results
1946
1947
1948
class IndexCol:
1949
"""
1950
an index column description class
1951
1952
Parameters
1953
----------
1954
axis : axis which I reference
1955
values : the ndarray like converted values
1956
kind : a string description of this type
1957
typ : the pytables type
1958
pos : the position in the pytables
1959
1960
"""
1961
1962
is_an_indexable = True
1963
is_data_indexable = True
1964
_info_fields = ["freq", "tz", "index_name"]
1965
1966
name: str
1967
cname: str
1968
1969
def __init__(
1970
self,
1971
name: str,
1972
values=None,
1973
kind=None,
1974
typ=None,
1975
cname: str | None = None,
1976
axis=None,
1977
pos=None,
1978
freq=None,
1979
tz=None,
1980
index_name=None,
1981
ordered=None,
1982
table=None,
1983
meta=None,
1984
metadata=None,
1985
):
1986
1987
if not isinstance(name, str):
1988
raise ValueError("`name` must be a str.")
1989
1990
self.values = values
1991
self.kind = kind
1992
self.typ = typ
1993
self.name = name
1994
self.cname = cname or name
1995
self.axis = axis
1996
self.pos = pos
1997
self.freq = freq
1998
self.tz = tz
1999
self.index_name = index_name
2000
self.ordered = ordered
2001
self.table = table
2002
self.meta = meta
2003
self.metadata = metadata
2004
2005
if pos is not None:
2006
self.set_pos(pos)
2007
2008
# These are ensured as long as the passed arguments match the
2009
# constructor annotations.
2010
assert isinstance(self.name, str)
2011
assert isinstance(self.cname, str)
2012
2013
@property
2014
def itemsize(self) -> int:
2015
# Assumes self.typ has already been initialized
2016
return self.typ.itemsize
2017
2018
@property
2019
def kind_attr(self) -> str:
2020
return f"{self.name}_kind"
2021
2022
def set_pos(self, pos: int):
2023
"""set the position of this column in the Table"""
2024
self.pos = pos
2025
if pos is not None and self.typ is not None:
2026
self.typ._v_pos = pos
2027
2028
def __repr__(self) -> str:
2029
temp = tuple(
2030
map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2031
)
2032
return ",".join(
2033
[
2034
f"{key}->{value}"
2035
for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2036
]
2037
)
2038
2039
def __eq__(self, other: Any) -> bool:
2040
"""compare 2 col items"""
2041
return all(
2042
getattr(self, a, None) == getattr(other, a, None)
2043
for a in ["name", "cname", "axis", "pos"]
2044
)
2045
2046
def __ne__(self, other) -> bool:
2047
return not self.__eq__(other)
2048
2049
@property
2050
def is_indexed(self) -> bool:
2051
"""return whether I am an indexed column"""
2052
if not hasattr(self.table, "cols"):
2053
# e.g. if infer hasn't been called yet, self.table will be None.
2054
return False
2055
return getattr(self.table.cols, self.cname).is_indexed
2056
2057
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2058
"""
2059
Convert the data from this selection to the appropriate pandas type.
2060
"""
2061
assert isinstance(values, np.ndarray), type(values)
2062
2063
# values is a recarray
2064
if values.dtype.fields is not None:
2065
values = values[self.cname]
2066
2067
val_kind = _ensure_decoded(self.kind)
2068
values = _maybe_convert(values, val_kind, encoding, errors)
2069
2070
kwargs = {}
2071
kwargs["name"] = _ensure_decoded(self.index_name)
2072
2073
if self.freq is not None:
2074
kwargs["freq"] = _ensure_decoded(self.freq)
2075
2076
factory: type[Index] | type[DatetimeIndex] = Index
2077
if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
2078
factory = DatetimeIndex
2079
elif values.dtype == "i8" and "freq" in kwargs:
2080
# PeriodIndex data is stored as i8
2081
# error: Incompatible types in assignment (expression has type
2082
# "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2083
# "Union[Type[Index], Type[DatetimeIndex]]")
2084
factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
2085
ordinal=x, **kwds
2086
)
2087
2088
# making an Index instance could throw a number of different errors
2089
try:
2090
new_pd_index = factory(values, **kwargs)
2091
except ValueError:
2092
# if the output freq is different that what we recorded,
2093
# it should be None (see also 'doc example part 2')
2094
if "freq" in kwargs:
2095
kwargs["freq"] = None
2096
new_pd_index = factory(values, **kwargs)
2097
final_pd_index = _set_tz(new_pd_index, self.tz)
2098
return final_pd_index, final_pd_index
2099
2100
def take_data(self):
2101
"""return the values"""
2102
return self.values
2103
2104
@property
2105
def attrs(self):
2106
return self.table._v_attrs
2107
2108
@property
2109
def description(self):
2110
return self.table.description
2111
2112
@property
2113
def col(self):
2114
"""return my current col description"""
2115
return getattr(self.description, self.cname, None)
2116
2117
@property
2118
def cvalues(self):
2119
"""return my cython values"""
2120
return self.values
2121
2122
def __iter__(self):
2123
return iter(self.values)
2124
2125
def maybe_set_size(self, min_itemsize=None):
2126
"""
2127
maybe set a string col itemsize:
2128
            min_itemsize can be an integer or a dict with this column's name
2129
with an integer size
2130
"""
2131
if _ensure_decoded(self.kind) == "string":
2132
if isinstance(min_itemsize, dict):
2133
min_itemsize = min_itemsize.get(self.name)
2134
2135
if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2136
self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2137
2138
def validate_names(self):
2139
pass
2140
2141
def validate_and_set(self, handler: AppendableTable, append: bool):
2142
self.table = handler.table
2143
self.validate_col()
2144
self.validate_attr(append)
2145
self.validate_metadata(handler)
2146
self.write_metadata(handler)
2147
self.set_attr()
2148
2149
def validate_col(self, itemsize=None):
2150
"""validate this column: return the compared against itemsize"""
2151
# validate this column for string truncation (or reset to the max size)
2152
if _ensure_decoded(self.kind) == "string":
2153
c = self.col
2154
if c is not None:
2155
if itemsize is None:
2156
itemsize = self.itemsize
2157
if c.itemsize < itemsize:
2158
raise ValueError(
2159
f"Trying to store a string with len [{itemsize}] in "
2160
f"[{self.cname}] column but\nthis column has a limit of "
2161
f"[{c.itemsize}]!\nConsider using min_itemsize to "
2162
"preset the sizes on these columns"
2163
)
2164
return c.itemsize
2165
2166
return None
2167
2168
def validate_attr(self, append: bool):
2169
# check for backwards incompatibility
2170
if append:
2171
existing_kind = getattr(self.attrs, self.kind_attr, None)
2172
if existing_kind is not None and existing_kind != self.kind:
2173
raise TypeError(
2174
f"incompatible kind in col [{existing_kind} - {self.kind}]"
2175
)
2176
2177
def update_info(self, info):
2178
"""
2179
set/update the info for this indexable with the key/value
2180
if there is a conflict raise/warn as needed
2181
"""
2182
for key in self._info_fields:
2183
2184
value = getattr(self, key, None)
2185
idx = info.setdefault(self.name, {})
2186
2187
existing_value = idx.get(key)
2188
if key in idx and value is not None and existing_value != value:
2189
# frequency/name just warn
2190
if key in ["freq", "index_name"]:
2191
ws = attribute_conflict_doc % (key, existing_value, value)
2192
warnings.warn(
2193
ws, AttributeConflictWarning, stacklevel=find_stack_level()
2194
)
2195
2196
# reset
2197
idx[key] = None
2198
setattr(self, key, None)
2199
2200
else:
2201
raise ValueError(
2202
f"invalid info for [{self.name}] for [{key}], "
2203
f"existing_value [{existing_value}] conflicts with "
2204
f"new value [{value}]"
2205
)
2206
else:
2207
if value is not None or existing_value is not None:
2208
idx[key] = value
2209
2210
def set_info(self, info):
2211
"""set my state from the passed info"""
2212
idx = info.get(self.name)
2213
if idx is not None:
2214
self.__dict__.update(idx)
2215
2216
def set_attr(self):
2217
"""set the kind for this column"""
2218
setattr(self.attrs, self.kind_attr, self.kind)
2219
2220
def validate_metadata(self, handler: AppendableTable):
2221
"""validate that kind=category does not change the categories"""
2222
if self.meta == "category":
2223
new_metadata = self.metadata
2224
cur_metadata = handler.read_metadata(self.cname)
2225
if (
2226
new_metadata is not None
2227
and cur_metadata is not None
2228
and not array_equivalent(new_metadata, cur_metadata)
2229
):
2230
raise ValueError(
2231
"cannot append a categorical with "
2232
"different categories to the existing"
2233
)
2234
2235
def write_metadata(self, handler: AppendableTable):
2236
"""set the meta data"""
2237
if self.metadata is not None:
2238
handler.write_metadata(self.cname, self.metadata)
2239
2240
2241
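
# Editor's note: illustrative sketch, not part of pandas.  ``validate_col``
# above is the check that raises when an appended string no longer fits the
# column width chosen at creation time; ``min_itemsize`` is the public knob
# for presetting that width.  The helper name and "demo.h5" are made up.
def _example_min_itemsize(path: str = "demo.h5") -> None:
    import pandas as pd

    with pd.HDFStore(path, mode="w") as store:
        # Reserve 30 bytes for the "name" column up front; without this,
        # appending a longer string later raises the ValueError built in
        # ``validate_col`` above.
        store.append("df", pd.DataFrame({"name": ["ab"]}), min_itemsize={"name": 30})
        store.append("df", pd.DataFrame({"name": ["a considerably longer value"]}))

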
class GenericIndexCol(IndexCol):
2242
"""an index which is not represented in the data of the table"""
2243
2244
@property
2245
def is_indexed(self) -> bool:
2246
return False
2247
2248
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2249
"""
2250
Convert the data from this selection to the appropriate pandas type.
2251
2252
Parameters
2253
----------
2254
values : np.ndarray
2255
nan_rep : str
2256
encoding : str
2257
errors : str
2258
"""
2259
assert isinstance(values, np.ndarray), type(values)
2260
2261
# error: Incompatible types in assignment (expression has type
2262
# "Int64Index", variable has type "ndarray")
2263
values = Int64Index(np.arange(len(values))) # type: ignore[assignment]
2264
return values, values
2265
2266
def set_attr(self):
2267
pass
2268
2269
2270
class DataCol(IndexCol):
2271
"""
2272
a data holding column, by definition this is not indexable
2273
2274
Parameters
2275
----------
2276
data : the actual data
2277
cname : the column name in the table to hold the data (typically
2278
values)
2279
meta : a string description of the metadata
2280
metadata : the actual metadata
2281
"""
2282
2283
is_an_indexable = False
2284
is_data_indexable = False
2285
_info_fields = ["tz", "ordered"]
2286
2287
def __init__(
2288
self,
2289
name: str,
2290
values=None,
2291
kind=None,
2292
typ=None,
2293
cname=None,
2294
pos=None,
2295
tz=None,
2296
ordered=None,
2297
table=None,
2298
meta=None,
2299
metadata=None,
2300
dtype: DtypeArg | None = None,
2301
data=None,
2302
):
2303
super().__init__(
2304
name=name,
2305
values=values,
2306
kind=kind,
2307
typ=typ,
2308
pos=pos,
2309
cname=cname,
2310
tz=tz,
2311
ordered=ordered,
2312
table=table,
2313
meta=meta,
2314
metadata=metadata,
2315
)
2316
self.dtype = dtype
2317
self.data = data
2318
2319
@property
2320
def dtype_attr(self) -> str:
2321
return f"{self.name}_dtype"
2322
2323
@property
2324
def meta_attr(self) -> str:
2325
return f"{self.name}_meta"
2326
2327
def __repr__(self) -> str:
2328
temp = tuple(
2329
map(
2330
pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2331
)
2332
)
2333
return ",".join(
2334
[
2335
f"{key}->{value}"
2336
for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2337
]
2338
)
2339
2340
def __eq__(self, other: Any) -> bool:
2341
"""compare 2 col items"""
2342
return all(
2343
getattr(self, a, None) == getattr(other, a, None)
2344
for a in ["name", "cname", "dtype", "pos"]
2345
)
2346
2347
def set_data(self, data: ArrayLike):
2348
assert data is not None
2349
assert self.dtype is None
2350
2351
data, dtype_name = _get_data_and_dtype_name(data)
2352
2353
self.data = data
2354
self.dtype = dtype_name
2355
self.kind = _dtype_to_kind(dtype_name)
2356
2357
def take_data(self):
2358
"""return the data"""
2359
return self.data
2360
2361
@classmethod
2362
def _get_atom(cls, values: ArrayLike) -> Col:
2363
"""
2364
Get an appropriately typed and shaped pytables.Col object for values.
2365
"""
2366
dtype = values.dtype
2367
# error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2368
# attribute "itemsize"
2369
itemsize = dtype.itemsize # type: ignore[union-attr]
2370
2371
shape = values.shape
2372
if values.ndim == 1:
2373
# EA, use block shape pretending it is 2D
2374
# TODO(EA2D): not necessary with 2D EAs
2375
shape = (1, values.size)
2376
2377
if isinstance(values, Categorical):
2378
codes = values.codes
2379
atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2380
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2381
atom = cls.get_atom_datetime64(shape)
2382
elif is_timedelta64_dtype(dtype):
2383
atom = cls.get_atom_timedelta64(shape)
2384
elif is_complex_dtype(dtype):
2385
atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2386
elif is_string_dtype(dtype):
2387
atom = cls.get_atom_string(shape, itemsize)
2388
else:
2389
atom = cls.get_atom_data(shape, kind=dtype.name)
2390
2391
return atom
2392
2393
@classmethod
2394
def get_atom_string(cls, shape, itemsize):
2395
return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2396
2397
@classmethod
2398
def get_atom_coltype(cls, kind: str) -> type[Col]:
2399
"""return the PyTables column class for this column"""
2400
if kind.startswith("uint"):
2401
k4 = kind[4:]
2402
col_name = f"UInt{k4}Col"
2403
elif kind.startswith("period"):
2404
# we store as integer
2405
col_name = "Int64Col"
2406
else:
2407
kcap = kind.capitalize()
2408
col_name = f"{kcap}Col"
2409
2410
return getattr(_tables(), col_name)
2411
2412
@classmethod
2413
def get_atom_data(cls, shape, kind: str) -> Col:
2414
return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2415
2416
@classmethod
2417
def get_atom_datetime64(cls, shape):
2418
return _tables().Int64Col(shape=shape[0])
2419
2420
@classmethod
2421
def get_atom_timedelta64(cls, shape):
2422
return _tables().Int64Col(shape=shape[0])
2423
2424
@property
2425
def shape(self):
2426
return getattr(self.data, "shape", None)
2427
2428
@property
2429
def cvalues(self):
2430
"""return my cython values"""
2431
return self.data
2432
2433
def validate_attr(self, append):
2434
"""validate that we have the same order as the existing & same dtype"""
2435
if append:
2436
existing_fields = getattr(self.attrs, self.kind_attr, None)
2437
if existing_fields is not None and existing_fields != list(self.values):
2438
raise ValueError("appended items do not match existing items in table!")
2439
2440
existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2441
if existing_dtype is not None and existing_dtype != self.dtype:
2442
raise ValueError(
2443
"appended items dtype do not match existing items dtype in table!"
2444
)
2445
2446
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2447
"""
2448
Convert the data from this selection to the appropriate pandas type.
2449
2450
Parameters
2451
----------
2452
values : np.ndarray
2453
nan_rep :
2454
encoding : str
2455
errors : str
2456
2457
Returns
2458
-------
2459
index : listlike to become an Index
2460
data : ndarraylike to become a column
2461
"""
2462
assert isinstance(values, np.ndarray), type(values)
2463
2464
# values is a recarray
2465
if values.dtype.fields is not None:
2466
values = values[self.cname]
2467
2468
assert self.typ is not None
2469
if self.dtype is None:
2470
# Note: in tests we never have timedelta64 or datetime64,
2471
# so the _get_data_and_dtype_name may be unnecessary
2472
converted, dtype_name = _get_data_and_dtype_name(values)
2473
kind = _dtype_to_kind(dtype_name)
2474
else:
2475
converted = values
2476
dtype_name = self.dtype
2477
kind = self.kind
2478
2479
assert isinstance(converted, np.ndarray) # for mypy
2480
2481
# use the meta if needed
2482
meta = _ensure_decoded(self.meta)
2483
metadata = self.metadata
2484
ordered = self.ordered
2485
tz = self.tz
2486
2487
assert dtype_name is not None
2488
# convert to the correct dtype
2489
dtype = _ensure_decoded(dtype_name)
2490
2491
# reverse converts
2492
if dtype == "datetime64":
2493
# recreate with tz if indicated
2494
converted = _set_tz(converted, tz, coerce=True)
2495
2496
elif dtype == "timedelta64":
2497
converted = np.asarray(converted, dtype="m8[ns]")
2498
elif dtype == "date":
2499
try:
2500
converted = np.asarray(
2501
[date.fromordinal(v) for v in converted], dtype=object
2502
)
2503
except ValueError:
2504
converted = np.asarray(
2505
[date.fromtimestamp(v) for v in converted], dtype=object
2506
)
2507
2508
elif meta == "category":
2509
# we have a categorical
2510
categories = metadata
2511
codes = converted.ravel()
2512
2513
# if we have stored a NaN in the categories
2514
# then strip it; in theory we could have BOTH
2515
# -1s in the codes and nulls :<
2516
if categories is None:
2517
# Handle case of NaN-only categorical columns in which case
2518
# the categories are an empty array; when this is stored,
2519
# pytables cannot write a zero-len array, so on readback
2520
# the categories would be None and `read_hdf()` would fail.
2521
categories = Index([], dtype=np.float64)
2522
else:
2523
mask = isna(categories)
2524
if mask.any():
2525
categories = categories[~mask]
2526
codes[codes != -1] -= mask.astype(int).cumsum()._values
2527
2528
converted = Categorical.from_codes(
2529
codes, categories=categories, ordered=ordered
2530
)
2531
2532
else:
2533
2534
try:
2535
converted = converted.astype(dtype, copy=False)
2536
except TypeError:
2537
converted = converted.astype("O", copy=False)
2538
2539
# convert nans / decode
2540
if _ensure_decoded(kind) == "string":
2541
converted = _unconvert_string_array(
2542
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2543
)
2544
2545
return self.values, converted
2546
2547
def set_attr(self):
2548
"""set the data for this column"""
2549
setattr(self.attrs, self.kind_attr, self.values)
2550
setattr(self.attrs, self.meta_attr, self.meta)
2551
assert self.dtype is not None
2552
setattr(self.attrs, self.dtype_attr, self.dtype)
2553
2554
2555
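
# Editor's note: illustrative sketch, not part of pandas.  ``DataIndexableCol``
# below is what a column becomes when it is listed in ``data_columns``; only
# such columns (and the index) can appear in a ``where`` clause.  The helper
# name and "demo.h5" are made up.
def _example_data_columns(path: str = "demo.h5") -> None:
    import pandas as pd

    df = pd.DataFrame({"x": range(5), "y": list("aabbc")})
    with pd.HDFStore(path, mode="w") as store:
        store.append("df", df, data_columns=["y"])
        # "y" is stored as its own queryable column, so it can be filtered
        # inside PyTables; "x" lives in a values block and cannot.
        subset = store.select("df", where='y == "b"')
        print(subset)

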
class DataIndexableCol(DataCol):
2556
"""represent a data column that can be indexed"""
2557
2558
is_data_indexable = True
2559
2560
def validate_names(self):
2561
if not Index(self.values).is_object():
2562
# TODO: should the message here be more specifically non-str?
2563
raise ValueError("cannot have non-object label DataIndexableCol")
2564
2565
@classmethod
2566
def get_atom_string(cls, shape, itemsize):
2567
return _tables().StringCol(itemsize=itemsize)
2568
2569
@classmethod
2570
def get_atom_data(cls, shape, kind: str) -> Col:
2571
return cls.get_atom_coltype(kind=kind)()
2572
2573
@classmethod
2574
def get_atom_datetime64(cls, shape):
2575
return _tables().Int64Col()
2576
2577
@classmethod
2578
def get_atom_timedelta64(cls, shape):
2579
return _tables().Int64Col()
2580
2581
2582
class GenericDataIndexableCol(DataIndexableCol):
2583
"""represent a generic pytables data column"""
2584
2585
pass
2586
2587
2588
class Fixed:
2589
"""
2590
represent an object in my store
2591
facilitate read/write of various types of objects
2592
this is an abstract base class
2593
2594
Parameters
2595
----------
2596
parent : HDFStore
2597
group : Node
2598
The group node where the table resides.
2599
"""
2600
2601
pandas_kind: str
2602
format_type: str = "fixed" # GH#30962 needed by dask
2603
obj_type: type[DataFrame | Series]
2604
ndim: int
2605
encoding: str
2606
parent: HDFStore
2607
group: Node
2608
errors: str
2609
is_table = False
2610
2611
def __init__(
2612
self,
2613
parent: HDFStore,
2614
group: Node,
2615
encoding: str = "UTF-8",
2616
errors: str = "strict",
2617
):
2618
assert isinstance(parent, HDFStore), type(parent)
2619
assert _table_mod is not None # needed for mypy
2620
assert isinstance(group, _table_mod.Node), type(group)
2621
self.parent = parent
2622
self.group = group
2623
self.encoding = _ensure_encoding(encoding)
2624
self.errors = errors
2625
2626
@property
2627
def is_old_version(self) -> bool:
2628
return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2629
2630
@property
2631
def version(self) -> tuple[int, int, int]:
2632
"""compute and set our version"""
2633
version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2634
try:
2635
version = tuple(int(x) for x in version.split("."))
2636
if len(version) == 2:
2637
version = version + (0,)
2638
except AttributeError:
2639
version = (0, 0, 0)
2640
return version
2641
2642
@property
2643
def pandas_type(self):
2644
return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2645
2646
def __repr__(self) -> str:
2647
"""return a pretty representation of myself"""
2648
self.infer_axes()
2649
s = self.shape
2650
if s is not None:
2651
if isinstance(s, (list, tuple)):
2652
jshape = ",".join([pprint_thing(x) for x in s])
2653
s = f"[{jshape}]"
2654
return f"{self.pandas_type:12.12} (shape->{s})"
2655
return self.pandas_type
2656
2657
def set_object_info(self):
2658
"""set my pandas type & version"""
2659
self.attrs.pandas_type = str(self.pandas_kind)
2660
self.attrs.pandas_version = str(_version)
2661
2662
def copy(self):
2663
new_self = copy.copy(self)
2664
return new_self
2665
2666
@property
2667
def shape(self):
2668
return self.nrows
2669
2670
@property
2671
def pathname(self):
2672
return self.group._v_pathname
2673
2674
@property
2675
def _handle(self):
2676
return self.parent._handle
2677
2678
@property
2679
def _filters(self):
2680
return self.parent._filters
2681
2682
@property
2683
def _complevel(self) -> int:
2684
return self.parent._complevel
2685
2686
@property
2687
def _fletcher32(self) -> bool:
2688
return self.parent._fletcher32
2689
2690
@property
2691
def attrs(self):
2692
return self.group._v_attrs
2693
2694
def set_attrs(self):
2695
"""set our object attributes"""
2696
pass
2697
2698
def get_attrs(self):
2699
"""get our object attributes"""
2700
pass
2701
2702
@property
2703
def storable(self):
2704
"""return my storable"""
2705
return self.group
2706
2707
@property
2708
def is_exists(self) -> bool:
2709
return False
2710
2711
@property
2712
def nrows(self):
2713
return getattr(self.storable, "nrows", None)
2714
2715
def validate(self, other):
2716
"""validate against an existing storable"""
2717
if other is None:
2718
return
2719
return True
2720
2721
def validate_version(self, where=None):
2722
"""are we trying to operate on an old version?"""
2723
return True
2724
2725
def infer_axes(self):
2726
"""
2727
infer the axes of my storer
2728
return a boolean indicating if we have a valid storer or not
2729
"""
2730
s = self.storable
2731
if s is None:
2732
return False
2733
self.get_attrs()
2734
return True
2735
2736
def read(
2737
self,
2738
where=None,
2739
columns=None,
2740
start: int | None = None,
2741
stop: int | None = None,
2742
):
2743
raise NotImplementedError(
2744
"cannot read on an abstract storer: subclasses should implement"
2745
)
2746
2747
def write(self, **kwargs):
2748
raise NotImplementedError(
2749
"cannot write on an abstract storer: subclasses should implement"
2750
)
2751
2752
def delete(self, where=None, start: int | None = None, stop: int | None = None):
2753
"""
2754
support fully deleting the node in its entirety (only) - where
2755
specification must be None
2756
"""
2757
if com.all_none(where, start, stop):
2758
self._handle.remove_node(self.group, recursive=True)
2759
return None
2760
2761
raise TypeError("cannot delete on an abstract storer")
2762
2763
2764
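
# Editor's note: illustrative sketch, not part of pandas.  ``Fixed.delete``
# above only supports removing the whole node; row-wise deletion with a
# ``where`` clause is implemented further down on the table classes.  The
# helper name and "demo.h5" are made up.
def _example_remove(path: str = "demo.h5") -> None:
    import pandas as pd

    df = pd.DataFrame({"a": range(5)})
    with pd.HDFStore(path, mode="w") as store:
        store.put("fixed_df", df)  # default fixed format
        store.append("table_df", df)  # table format
        store.remove("fixed_df")  # whole-node delete works for any format
        store.remove("table_df", where="index > 2")  # row-wise needs a table

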
class GenericFixed(Fixed):
2765
"""a generified fixed version"""
2766
2767
_index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2768
_reverse_index_map = {v: k for k, v in _index_type_map.items()}
2769
attributes: list[str] = []
2770
2771
# indexer helpers
2772
def _class_to_alias(self, cls) -> str:
2773
return self._index_type_map.get(cls, "")
2774
2775
def _alias_to_class(self, alias):
2776
if isinstance(alias, type): # pragma: no cover
2777
# compat: for a short period of time master stored types
2778
return alias
2779
return self._reverse_index_map.get(alias, Index)
2780
2781
def _get_index_factory(self, attrs):
2782
index_class = self._alias_to_class(
2783
_ensure_decoded(getattr(attrs, "index_class", ""))
2784
)
2785
2786
factory: Callable
2787
2788
if index_class == DatetimeIndex:
2789
2790
def f(values, freq=None, tz=None):
2791
# data are already in UTC, localize and convert if tz present
2792
dta = DatetimeArray._simple_new(values.values, freq=freq)
2793
result = DatetimeIndex._simple_new(dta, name=None)
2794
if tz is not None:
2795
result = result.tz_localize("UTC").tz_convert(tz)
2796
return result
2797
2798
factory = f
2799
elif index_class == PeriodIndex:
2800
2801
def f(values, freq=None, tz=None):
2802
parr = PeriodArray._simple_new(values, freq=freq)
2803
return PeriodIndex._simple_new(parr, name=None)
2804
2805
factory = f
2806
else:
2807
factory = index_class
2808
2809
kwargs = {}
2810
if "freq" in attrs:
2811
kwargs["freq"] = attrs["freq"]
2812
if index_class is Index:
2813
# DTI/PI would be gotten by _alias_to_class
2814
factory = TimedeltaIndex
2815
2816
if "tz" in attrs:
2817
if isinstance(attrs["tz"], bytes):
2818
# created by python2
2819
kwargs["tz"] = attrs["tz"].decode("utf-8")
2820
else:
2821
# created by python3
2822
kwargs["tz"] = attrs["tz"]
2823
assert index_class is DatetimeIndex # just checking
2824
2825
return factory, kwargs
2826
2827
def validate_read(self, columns, where):
2828
"""
2829
        raise if any keywords are passed which are not None
2830
"""
2831
if columns is not None:
2832
raise TypeError(
2833
"cannot pass a column specification when reading "
2834
"a Fixed format store. this store must be selected in its entirety"
2835
)
2836
if where is not None:
2837
raise TypeError(
2838
"cannot pass a where specification when reading "
2839
"from a Fixed format store. this store must be selected in its entirety"
2840
)
2841
2842
@property
2843
def is_exists(self) -> bool:
2844
return True
2845
2846
def set_attrs(self):
2847
"""set our object attributes"""
2848
self.attrs.encoding = self.encoding
2849
self.attrs.errors = self.errors
2850
2851
def get_attrs(self):
2852
"""retrieve our attributes"""
2853
self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2854
self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2855
for n in self.attributes:
2856
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2857
2858
def write(self, obj, **kwargs):
2859
self.set_attrs()
2860
2861
def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2862
"""read an array for the specified node (off of group"""
2863
import tables
2864
2865
node = getattr(self.group, key)
2866
attrs = node._v_attrs
2867
2868
transposed = getattr(attrs, "transposed", False)
2869
2870
if isinstance(node, tables.VLArray):
2871
ret = node[0][start:stop]
2872
else:
2873
dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2874
shape = getattr(attrs, "shape", None)
2875
2876
if shape is not None:
2877
# length 0 axis
2878
ret = np.empty(shape, dtype=dtype)
2879
else:
2880
ret = node[start:stop]
2881
2882
if dtype == "datetime64":
2883
# reconstruct a timezone if indicated
2884
tz = getattr(attrs, "tz", None)
2885
ret = _set_tz(ret, tz, coerce=True)
2886
2887
elif dtype == "timedelta64":
2888
ret = np.asarray(ret, dtype="m8[ns]")
2889
2890
if transposed:
2891
return ret.T
2892
else:
2893
return ret
2894
2895
def read_index(
2896
self, key: str, start: int | None = None, stop: int | None = None
2897
) -> Index:
2898
variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2899
2900
if variety == "multi":
2901
return self.read_multi_index(key, start=start, stop=stop)
2902
elif variety == "regular":
2903
node = getattr(self.group, key)
2904
index = self.read_index_node(node, start=start, stop=stop)
2905
return index
2906
else: # pragma: no cover
2907
raise TypeError(f"unrecognized index variety: {variety}")
2908
2909
def write_index(self, key: str, index: Index):
2910
if isinstance(index, MultiIndex):
2911
setattr(self.attrs, f"{key}_variety", "multi")
2912
self.write_multi_index(key, index)
2913
else:
2914
setattr(self.attrs, f"{key}_variety", "regular")
2915
converted = _convert_index("index", index, self.encoding, self.errors)
2916
2917
self.write_array(key, converted.values)
2918
2919
node = getattr(self.group, key)
2920
node._v_attrs.kind = converted.kind
2921
node._v_attrs.name = index.name
2922
2923
if isinstance(index, (DatetimeIndex, PeriodIndex)):
2924
node._v_attrs.index_class = self._class_to_alias(type(index))
2925
2926
if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2927
node._v_attrs.freq = index.freq
2928
2929
if isinstance(index, DatetimeIndex) and index.tz is not None:
2930
node._v_attrs.tz = _get_tz(index.tz)
2931
2932
def write_multi_index(self, key: str, index: MultiIndex):
2933
setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2934
2935
for i, (lev, level_codes, name) in enumerate(
2936
zip(index.levels, index.codes, index.names)
2937
):
2938
# write the level
2939
if is_extension_array_dtype(lev):
2940
raise NotImplementedError(
2941
"Saving a MultiIndex with an extension dtype is not supported."
2942
)
2943
level_key = f"{key}_level{i}"
2944
conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2945
self.write_array(level_key, conv_level.values)
2946
node = getattr(self.group, level_key)
2947
node._v_attrs.kind = conv_level.kind
2948
node._v_attrs.name = name
2949
2950
# write the name
2951
setattr(node._v_attrs, f"{key}_name{name}", name)
2952
2953
# write the labels
2954
label_key = f"{key}_label{i}"
2955
self.write_array(label_key, level_codes)
2956
2957
def read_multi_index(
2958
self, key: str, start: int | None = None, stop: int | None = None
2959
) -> MultiIndex:
2960
nlevels = getattr(self.attrs, f"{key}_nlevels")
2961
2962
levels = []
2963
codes = []
2964
names: list[Hashable] = []
2965
for i in range(nlevels):
2966
level_key = f"{key}_level{i}"
2967
node = getattr(self.group, level_key)
2968
lev = self.read_index_node(node, start=start, stop=stop)
2969
levels.append(lev)
2970
names.append(lev.name)
2971
2972
label_key = f"{key}_label{i}"
2973
level_codes = self.read_array(label_key, start=start, stop=stop)
2974
codes.append(level_codes)
2975
2976
return MultiIndex(
2977
levels=levels, codes=codes, names=names, verify_integrity=True
2978
)
2979
2980
def read_index_node(
2981
self, node: Node, start: int | None = None, stop: int | None = None
2982
) -> Index:
2983
data = node[start:stop]
2984
# If the index was an empty array write_array_empty() will
2985
# have written a sentinel. Here we replace it with the original.
2986
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
2987
data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
2988
kind = _ensure_decoded(node._v_attrs.kind)
2989
name = None
2990
2991
if "name" in node._v_attrs:
2992
name = _ensure_str(node._v_attrs.name)
2993
name = _ensure_decoded(name)
2994
2995
attrs = node._v_attrs
2996
factory, kwargs = self._get_index_factory(attrs)
2997
2998
if kind == "date":
2999
index = factory(
3000
_unconvert_index(
3001
data, kind, encoding=self.encoding, errors=self.errors
3002
),
3003
dtype=object,
3004
**kwargs,
3005
)
3006
else:
3007
index = factory(
3008
_unconvert_index(
3009
data, kind, encoding=self.encoding, errors=self.errors
3010
),
3011
**kwargs,
3012
)
3013
3014
index.name = name
3015
3016
return index
3017
3018
def write_array_empty(self, key: str, value: ArrayLike):
3019
"""write a 0-len array"""
3020
# ugly hack for length 0 axes
3021
arr = np.empty((1,) * value.ndim)
3022
self._handle.create_array(self.group, key, arr)
3023
node = getattr(self.group, key)
3024
node._v_attrs.value_type = str(value.dtype)
3025
node._v_attrs.shape = value.shape
3026
3027
def write_array(
3028
self, key: str, obj: DataFrame | Series, items: Index | None = None
3029
) -> None:
3030
# TODO: we only have a few tests that get here, the only EA
3031
# that gets passed is DatetimeArray, and we never have
3032
# both self._filters and EA
3033
3034
value = extract_array(obj, extract_numpy=True)
3035
3036
if key in self.group:
3037
self._handle.remove_node(self.group, key)
3038
3039
# Transform needed to interface with pytables row/col notation
3040
empty_array = value.size == 0
3041
transposed = False
3042
3043
if is_categorical_dtype(value.dtype):
3044
raise NotImplementedError(
3045
"Cannot store a category dtype in a HDF5 dataset that uses format="
3046
'"fixed". Use format="table".'
3047
)
3048
if not empty_array:
3049
if hasattr(value, "T"):
3050
# ExtensionArrays (1d) may not have transpose.
3051
value = value.T
3052
transposed = True
3053
3054
atom = None
3055
if self._filters is not None:
3056
with suppress(ValueError):
3057
# get the atom for this datatype
3058
atom = _tables().Atom.from_dtype(value.dtype)
3059
3060
if atom is not None:
3061
# We only get here if self._filters is non-None and
3062
# the Atom.from_dtype call succeeded
3063
3064
# create an empty chunked array and fill it from value
3065
if not empty_array:
3066
ca = self._handle.create_carray(
3067
self.group, key, atom, value.shape, filters=self._filters
3068
)
3069
ca[:] = value
3070
3071
else:
3072
self.write_array_empty(key, value)
3073
3074
elif value.dtype.type == np.object_:
3075
# infer the type, warn if we have a non-string type here (for
3076
# performance)
3077
inferred_type = lib.infer_dtype(value, skipna=False)
3078
if empty_array:
3079
pass
3080
elif inferred_type == "string":
3081
pass
3082
else:
3083
ws = performance_doc % (inferred_type, key, items)
3084
warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3085
3086
vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3087
vlarr.append(value)
3088
3089
elif is_datetime64_dtype(value.dtype):
3090
self._handle.create_array(self.group, key, value.view("i8"))
3091
getattr(self.group, key)._v_attrs.value_type = "datetime64"
3092
elif is_datetime64tz_dtype(value.dtype):
3093
# store as UTC
3094
# with a zone
3095
3096
# error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3097
# attribute "asi8"
3098
self._handle.create_array(
3099
self.group, key, value.asi8 # type: ignore[union-attr]
3100
)
3101
3102
node = getattr(self.group, key)
3103
# error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3104
# attribute "tz"
3105
node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3106
node._v_attrs.value_type = "datetime64"
3107
elif is_timedelta64_dtype(value.dtype):
3108
self._handle.create_array(self.group, key, value.view("i8"))
3109
getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3110
elif empty_array:
3111
self.write_array_empty(key, value)
3112
else:
3113
self._handle.create_array(self.group, key, value)
3114
3115
getattr(self.group, key)._v_attrs.transposed = transposed
3116
3117
3118
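
# Editor's note: illustrative sketch, not part of pandas.  ``validate_read``
# above is why a fixed-format storer must be read back in its entirety:
# passing ``where`` or ``columns`` raises TypeError, while a table-format
# storer accepts both.  The helper name and "demo.h5" are made up.
def _example_fixed_vs_table(path: str = "demo.h5") -> None:
    import pandas as pd

    df = pd.DataFrame({"a": range(3), "b": range(3)})
    with pd.HDFStore(path, mode="w") as store:
        store.put("fixed_df", df, format="fixed")
        store.put("table_df", df, format="table")

        store.select("table_df", columns=["a"])  # fine: table format
        try:
            store.select("fixed_df", columns=["a"])  # TypeError from validate_read
        except TypeError as err:
            print(err)

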
class SeriesFixed(GenericFixed):
3119
pandas_kind = "series"
3120
attributes = ["name"]
3121
3122
name: Hashable
3123
3124
@property
3125
def shape(self):
3126
try:
3127
return (len(self.group.values),)
3128
except (TypeError, AttributeError):
3129
return None
3130
3131
def read(
3132
self,
3133
where=None,
3134
columns=None,
3135
start: int | None = None,
3136
stop: int | None = None,
3137
):
3138
self.validate_read(columns, where)
3139
index = self.read_index("index", start=start, stop=stop)
3140
values = self.read_array("values", start=start, stop=stop)
3141
return Series(values, index=index, name=self.name)
3142
3143
def write(self, obj, **kwargs):
3144
super().write(obj, **kwargs)
3145
self.write_index("index", obj.index)
3146
self.write_array("values", obj)
3147
self.attrs.name = obj.name
3148
3149
3150
class BlockManagerFixed(GenericFixed):
3151
attributes = ["ndim", "nblocks"]
3152
3153
nblocks: int
3154
3155
@property
3156
def shape(self) -> Shape | None:
3157
try:
3158
ndim = self.ndim
3159
3160
# items
3161
items = 0
3162
for i in range(self.nblocks):
3163
node = getattr(self.group, f"block{i}_items")
3164
shape = getattr(node, "shape", None)
3165
if shape is not None:
3166
items += shape[0]
3167
3168
# data shape
3169
node = self.group.block0_values
3170
shape = getattr(node, "shape", None)
3171
if shape is not None:
3172
shape = list(shape[0 : (ndim - 1)])
3173
else:
3174
shape = []
3175
3176
shape.append(items)
3177
3178
return shape
3179
except AttributeError:
3180
return None
3181
3182
def read(
3183
self,
3184
where=None,
3185
columns=None,
3186
start: int | None = None,
3187
stop: int | None = None,
3188
):
3189
# start, stop applied to rows, so 0th axis only
3190
self.validate_read(columns, where)
3191
select_axis = self.obj_type()._get_block_manager_axis(0)
3192
3193
axes = []
3194
for i in range(self.ndim):
3195
3196
_start, _stop = (start, stop) if i == select_axis else (None, None)
3197
ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3198
axes.append(ax)
3199
3200
items = axes[0]
3201
dfs = []
3202
3203
for i in range(self.nblocks):
3204
3205
blk_items = self.read_index(f"block{i}_items")
3206
values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3207
3208
columns = items[items.get_indexer(blk_items)]
3209
df = DataFrame(values.T, columns=columns, index=axes[1])
3210
dfs.append(df)
3211
3212
if len(dfs) > 0:
3213
out = concat(dfs, axis=1)
3214
out = out.reindex(columns=items, copy=False)
3215
return out
3216
3217
return DataFrame(columns=axes[0], index=axes[1])
3218
3219
def write(self, obj, **kwargs):
3220
super().write(obj, **kwargs)
3221
3222
# TODO(ArrayManager) HDFStore relies on accessing the blocks
3223
if isinstance(obj._mgr, ArrayManager):
3224
obj = obj._as_manager("block")
3225
3226
data = obj._mgr
3227
if not data.is_consolidated():
3228
data = data.consolidate()
3229
3230
self.attrs.ndim = data.ndim
3231
for i, ax in enumerate(data.axes):
3232
if i == 0 and (not ax.is_unique):
3233
raise ValueError("Columns index has to be unique for fixed format")
3234
self.write_index(f"axis{i}", ax)
3235
3236
# Supporting mixed-type DataFrame objects...nontrivial
3237
self.attrs.nblocks = len(data.blocks)
3238
for i, blk in enumerate(data.blocks):
3239
# I have no idea why, but writing values before items fixed #2299
3240
blk_items = data.items.take(blk.mgr_locs)
3241
self.write_array(f"block{i}_values", blk.values, items=blk_items)
3242
self.write_index(f"block{i}_items", blk_items)
3243
3244
3245
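
# Editor's note: illustrative sketch, not part of pandas.  The uniqueness
# check in ``BlockManagerFixed.write`` above means a frame with duplicate
# column labels cannot be stored in fixed format.  The helper name and
# "demo.h5" are made up.
def _example_unique_columns(path: str = "demo.h5") -> None:
    import pandas as pd

    df = pd.DataFrame([[1, 2]], columns=["a", "a"])  # duplicate labels
    with pd.HDFStore(path, mode="w") as store:
        try:
            store.put("df", df, format="fixed")
        except ValueError as err:
            # "Columns index has to be unique for fixed format"
            print(err)

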
class FrameFixed(BlockManagerFixed):
3246
pandas_kind = "frame"
3247
obj_type = DataFrame
3248
3249
3250
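
# Editor's note: illustrative sketch, not part of pandas.  ``Table.create_index``
# below is exposed publicly as ``HDFStore.create_table_index``; it builds a
# PyTables index on indexable/data columns so that ``where`` filtering can use
# it.  The helper name and "demo.h5" are made up.
def _example_create_table_index(path: str = "demo.h5") -> None:
    import pandas as pd

    df = pd.DataFrame({"a": range(100), "b": range(100)})
    with pd.HDFStore(path, mode="w") as store:
        # index=False skips index creation on append; build it afterwards
        store.append("df", df, data_columns=["a"], index=False)
        store.create_table_index("df", columns=["a"], optlevel=9, kind="full")
        print(store.select("df", where="a > 90"))

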
class Table(Fixed):
3251
"""
3252
represent a table:
3253
facilitate read/write of various types of tables
3254
3255
Attrs in Table Node
3256
-------------------
3257
    These are attributes that are stored in the main table node; they are
3258
necessary to recreate these tables when read back in.
3259
3260
index_axes : a list of tuples of the (original indexing axis and
3261
index column)
3262
non_index_axes: a list of tuples of the (original index axis and
3263
columns on a non-indexing axis)
3264
values_axes : a list of the columns which comprise the data of this
3265
table
3266
data_columns : a list of the columns that we are allowing indexing
3267
(these become single columns in values_axes)
3268
nan_rep : the string to use for nan representations for string
3269
objects
3270
levels : the names of levels
3271
metadata : the names of the metadata columns
3272
"""
3273
3274
pandas_kind = "wide_table"
3275
format_type: str = "table" # GH#30962 needed by dask
3276
table_type: str
3277
levels: int | list[Hashable] = 1
3278
is_table = True
3279
3280
index_axes: list[IndexCol]
3281
non_index_axes: list[tuple[int, Any]]
3282
values_axes: list[DataCol]
3283
data_columns: list
3284
metadata: list
3285
info: dict
3286
3287
def __init__(
3288
self,
3289
parent: HDFStore,
3290
group: Node,
3291
encoding=None,
3292
errors: str = "strict",
3293
index_axes=None,
3294
non_index_axes=None,
3295
values_axes=None,
3296
data_columns=None,
3297
info=None,
3298
nan_rep=None,
3299
):
3300
super().__init__(parent, group, encoding=encoding, errors=errors)
3301
self.index_axes = index_axes or []
3302
self.non_index_axes = non_index_axes or []
3303
self.values_axes = values_axes or []
3304
self.data_columns = data_columns or []
3305
self.info = info or {}
3306
self.nan_rep = nan_rep
3307
3308
@property
3309
def table_type_short(self) -> str:
3310
return self.table_type.split("_")[0]
3311
3312
def __repr__(self) -> str:
3313
"""return a pretty representation of myself"""
3314
self.infer_axes()
3315
jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3316
dc = f",dc->[{jdc}]"
3317
3318
ver = ""
3319
if self.is_old_version:
3320
jver = ".".join([str(x) for x in self.version])
3321
ver = f"[{jver}]"
3322
3323
jindex_axes = ",".join([a.name for a in self.index_axes])
3324
return (
3325
f"{self.pandas_type:12.12}{ver} "
3326
f"(typ->{self.table_type_short},nrows->{self.nrows},"
3327
f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3328
)
3329
3330
def __getitem__(self, c: str):
3331
"""return the axis for c"""
3332
for a in self.axes:
3333
if c == a.name:
3334
return a
3335
return None
3336
3337
def validate(self, other):
3338
"""validate against an existing table"""
3339
if other is None:
3340
return
3341
3342
if other.table_type != self.table_type:
3343
raise TypeError(
3344
"incompatible table_type with existing "
3345
f"[{other.table_type} - {self.table_type}]"
3346
)
3347
3348
for c in ["index_axes", "non_index_axes", "values_axes"]:
3349
sv = getattr(self, c, None)
3350
ov = getattr(other, c, None)
3351
if sv != ov:
3352
3353
# show the error for the specific axes
3354
# Argument 1 to "enumerate" has incompatible type
3355
# "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3356
for i, sax in enumerate(sv): # type: ignore[arg-type]
3357
# Value of type "Optional[Any]" is not indexable [index]
3358
oax = ov[i] # type: ignore[index]
3359
if sax != oax:
3360
raise ValueError(
3361
f"invalid combination of [{c}] on appending data "
3362
f"[{sax}] vs current table [{oax}]"
3363
)
3364
3365
# should never get here
3366
raise Exception(
3367
f"invalid combination of [{c}] on appending data [{sv}] vs "
3368
f"current table [{ov}]"
3369
)
3370
3371
@property
3372
def is_multi_index(self) -> bool:
3373
"""the levels attribute is 1 or a list in the case of a multi-index"""
3374
return isinstance(self.levels, list)
3375
3376
def validate_multiindex(
3377
self, obj: DataFrame | Series
3378
) -> tuple[DataFrame, list[Hashable]]:
3379
"""
3380
validate that we can store the multi-index; reset and return the
3381
new object
3382
"""
3383
levels = com.fill_missing_names(obj.index.names)
3384
try:
3385
reset_obj = obj.reset_index()
3386
except ValueError as err:
3387
raise ValueError(
3388
"duplicate names/columns in the multi-index when storing as a table"
3389
) from err
3390
assert isinstance(reset_obj, DataFrame) # for mypy
3391
return reset_obj, levels
3392
3393
@property
3394
def nrows_expected(self) -> int:
3395
"""based on our axes, compute the expected nrows"""
3396
return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3397
3398
@property
3399
def is_exists(self) -> bool:
3400
"""has this table been created"""
3401
return "table" in self.group
3402
3403
@property
3404
def storable(self):
3405
return getattr(self.group, "table", None)
3406
3407
@property
3408
def table(self):
3409
"""return the table group (this is my storable)"""
3410
return self.storable
3411
3412
@property
3413
def dtype(self):
3414
return self.table.dtype
3415
3416
@property
3417
def description(self):
3418
return self.table.description
3419
3420
@property
3421
def axes(self):
3422
return itertools.chain(self.index_axes, self.values_axes)
3423
3424
@property
3425
def ncols(self) -> int:
3426
"""the number of total columns in the values axes"""
3427
return sum(len(a.values) for a in self.values_axes)
3428
3429
@property
3430
def is_transposed(self) -> bool:
3431
return False
3432
3433
@property
3434
def data_orientation(self):
3435
"""return a tuple of my permutated axes, non_indexable at the front"""
3436
return tuple(
3437
itertools.chain(
3438
[int(a[0]) for a in self.non_index_axes],
3439
[int(a.axis) for a in self.index_axes],
3440
)
3441
)
3442
3443
def queryables(self) -> dict[str, Any]:
3444
"""return a dict of the kinds allowable columns for this object"""
3445
# mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3446
axis_names = {0: "index", 1: "columns"}
3447
3448
# compute the values_axes queryables
3449
d1 = [(a.cname, a) for a in self.index_axes]
3450
d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3451
d3 = [
3452
(v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3453
]
3454
3455
# error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" and
3456
# "List[Tuple[str, None]]")
3457
return dict(d1 + d2 + d3) # type: ignore[operator]
3458
3459
def index_cols(self):
3460
"""return a list of my index cols"""
3461
# Note: each `i.cname` below is assured to be a str.
3462
return [(i.axis, i.cname) for i in self.index_axes]
3463
3464
def values_cols(self) -> list[str]:
3465
"""return a list of my values cols"""
3466
return [i.cname for i in self.values_axes]
3467
3468
def _get_metadata_path(self, key: str) -> str:
3469
"""return the metadata pathname for this key"""
3470
group = self.group._v_pathname
3471
return f"{group}/meta/{key}/meta"
3472
3473
def write_metadata(self, key: str, values: np.ndarray):
3474
"""
3475
        Write out a metadata array to the key as a table-format Series.
3476
3477
Parameters
3478
----------
3479
key : str
3480
values : ndarray
3481
"""
3482
self.parent.put(
3483
self._get_metadata_path(key),
3484
Series(values),
3485
format="table",
3486
encoding=self.encoding,
3487
errors=self.errors,
3488
nan_rep=self.nan_rep,
3489
)
3490
3491
def read_metadata(self, key: str):
3492
"""return the meta data array for this key"""
3493
if getattr(getattr(self.group, "meta", None), key, None) is not None:
3494
return self.parent.select(self._get_metadata_path(key))
3495
return None
3496
3497
def set_attrs(self):
3498
"""set our table type & indexables"""
3499
self.attrs.table_type = str(self.table_type)
3500
self.attrs.index_cols = self.index_cols()
3501
self.attrs.values_cols = self.values_cols()
3502
self.attrs.non_index_axes = self.non_index_axes
3503
self.attrs.data_columns = self.data_columns
3504
self.attrs.nan_rep = self.nan_rep
3505
self.attrs.encoding = self.encoding
3506
self.attrs.errors = self.errors
3507
self.attrs.levels = self.levels
3508
self.attrs.info = self.info
3509
3510
def get_attrs(self):
3511
"""retrieve our attributes"""
3512
self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3513
self.data_columns = getattr(self.attrs, "data_columns", None) or []
3514
self.info = getattr(self.attrs, "info", None) or {}
3515
self.nan_rep = getattr(self.attrs, "nan_rep", None)
3516
self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3517
self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3518
self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3519
self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3520
self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3521
3522
def validate_version(self, where=None):
3523
"""are we trying to operate on an old version?"""
3524
if where is not None:
3525
if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
3526
ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3527
warnings.warn(ws, IncompatibilityWarning)
3528
3529
def validate_min_itemsize(self, min_itemsize):
3530
"""
3531
        validate that min_itemsize does not contain items that are not in the
3532
        axes; this needs data_columns to be defined
3533
"""
3534
if min_itemsize is None:
3535
return
3536
if not isinstance(min_itemsize, dict):
3537
return
3538
3539
q = self.queryables()
3540
for k in min_itemsize:
3541
3542
# ok, apply generally
3543
if k == "values":
3544
continue
3545
if k not in q:
3546
raise ValueError(
3547
f"min_itemsize has the key [{k}] which is not an axis or "
3548
"data_column"
3549
)
3550
3551
@cache_readonly
3552
def indexables(self):
3553
"""create/cache the indexables if they don't exist"""
3554
_indexables = []
3555
3556
desc = self.description
3557
table_attrs = self.table.attrs
3558
3559
# Note: each of the `name` kwargs below are str, ensured
3560
# by the definition in index_cols.
3561
# index columns
3562
for i, (axis, name) in enumerate(self.attrs.index_cols):
3563
atom = getattr(desc, name)
3564
md = self.read_metadata(name)
3565
meta = "category" if md is not None else None
3566
3567
kind_attr = f"{name}_kind"
3568
kind = getattr(table_attrs, kind_attr, None)
3569
3570
index_col = IndexCol(
3571
name=name,
3572
axis=axis,
3573
pos=i,
3574
kind=kind,
3575
typ=atom,
3576
table=self.table,
3577
meta=meta,
3578
metadata=md,
3579
)
3580
_indexables.append(index_col)
3581
3582
# values columns
3583
dc = set(self.data_columns)
3584
base_pos = len(_indexables)
3585
3586
def f(i, c):
3587
assert isinstance(c, str)
3588
klass = DataCol
3589
if c in dc:
3590
klass = DataIndexableCol
3591
3592
atom = getattr(desc, c)
3593
adj_name = _maybe_adjust_name(c, self.version)
3594
3595
# TODO: why kind_attr here?
3596
values = getattr(table_attrs, f"{adj_name}_kind", None)
3597
dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3598
# Argument 1 to "_dtype_to_kind" has incompatible type
3599
# "Optional[Any]"; expected "str" [arg-type]
3600
kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3601
3602
md = self.read_metadata(c)
3603
            # TODO: figure out why these two versions of `meta` don't always match.
3604
# meta = "category" if md is not None else None
3605
meta = getattr(table_attrs, f"{adj_name}_meta", None)
3606
3607
obj = klass(
3608
name=adj_name,
3609
cname=c,
3610
values=values,
3611
kind=kind,
3612
pos=base_pos + i,
3613
typ=atom,
3614
table=self.table,
3615
meta=meta,
3616
metadata=md,
3617
dtype=dtype,
3618
)
3619
return obj
3620
3621
# Note: the definition of `values_cols` ensures that each
3622
# `c` below is a str.
3623
_indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3624
3625
return _indexables
3626
3627
def create_index(self, columns=None, optlevel=None, kind: str | None = None):
3628
"""
3629
Create a pytables index on the specified columns.
3630
3631
Parameters
3632
----------
3633
columns : None, bool, or listlike[str]
3634
Indicate which columns to create an index on.
3635
3636
* False : Do not create any indexes.
3637
* True : Create indexes on all columns.
3638
* None : Create indexes on all columns.
3639
* listlike : Create indexes on the given columns.
3640
3641
optlevel : int or None, default None
3642
Optimization level, if None, pytables defaults to 6.
3643
kind : str or None, default None
3644
Kind of index, if None, pytables defaults to "medium".
3645
3646
Raises
3647
------
3648
TypeError if trying to create an index on a complex-type column.
3649
3650
Notes
3651
-----
3652
Cannot index Time64Col or ComplexCol.
3653
Pytables must be >= 3.0.
3654
"""
3655
if not self.infer_axes():
3656
return
3657
if columns is False:
3658
return
3659
3660
# index all indexables and data_columns
3661
if columns is None or columns is True:
3662
columns = [a.cname for a in self.axes if a.is_data_indexable]
3663
if not isinstance(columns, (tuple, list)):
3664
columns = [columns]
3665
3666
kw = {}
3667
if optlevel is not None:
3668
kw["optlevel"] = optlevel
3669
if kind is not None:
3670
kw["kind"] = kind
3671
3672
table = self.table
3673
for c in columns:
3674
v = getattr(table.cols, c, None)
3675
if v is not None:
3676
# remove the index if the kind/optlevel have changed
3677
if v.is_indexed:
3678
index = v.index
3679
cur_optlevel = index.optlevel
3680
cur_kind = index.kind
3681
3682
if kind is not None and cur_kind != kind:
3683
v.remove_index()
3684
else:
3685
kw["kind"] = cur_kind
3686
3687
if optlevel is not None and cur_optlevel != optlevel:
3688
v.remove_index()
3689
else:
3690
kw["optlevel"] = cur_optlevel
3691
3692
# create the index
3693
if not v.is_indexed:
3694
if v.type.startswith("complex"):
3695
raise TypeError(
3696
"Columns containing complex values can be stored but "
3697
"cannot be indexed when using table format. Either use "
3698
"fixed format, set index=False, or do not include "
3699
"the columns containing complex values to "
3700
"data_columns when initializing the table."
3701
)
3702
v.create_index(**kw)
3703
elif c in self.non_index_axes[0][1]:
3704
# GH 28156
3705
raise AttributeError(
3706
f"column {c} is not a data_column.\n"
3707
f"In order to read column {c} you must reload the dataframe \n"
3708
f"into HDFStore and include {c} with the data_columns argument."
3709
)
3710
3711
def _read_axes(
3712
self, where, start: int | None = None, stop: int | None = None
3713
) -> list[tuple[ArrayLike, ArrayLike]]:
3714
"""
3715
Create the axes sniffed from the table.
3716
3717
Parameters
3718
----------
3719
where : ???
3720
start : int or None, default None
3721
stop : int or None, default None
3722
3723
Returns
3724
-------
3725
List[Tuple[index_values, column_values]]
3726
"""
3727
# create the selection
3728
selection = Selection(self, where=where, start=start, stop=stop)
3729
values = selection.select()
3730
3731
results = []
3732
# convert the data
3733
for a in self.axes:
3734
a.set_info(self.info)
3735
res = a.convert(
3736
values,
3737
nan_rep=self.nan_rep,
3738
encoding=self.encoding,
3739
errors=self.errors,
3740
)
3741
results.append(res)
3742
3743
return results
3744
3745
@classmethod
3746
def get_object(cls, obj, transposed: bool):
3747
"""return the data for this obj"""
3748
return obj
3749
3750
def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3751
"""
3752
        take the input data_columns and min_itemsize and create a data
3753
columns spec
3754
"""
3755
if not len(non_index_axes):
3756
return []
3757
3758
axis, axis_labels = non_index_axes[0]
3759
info = self.info.get(axis, {})
3760
if info.get("type") == "MultiIndex" and data_columns:
3761
raise ValueError(
3762
f"cannot use a multi-index on axis [{axis}] with "
3763
f"data_columns {data_columns}"
3764
)
3765
3766
# evaluate the passed data_columns, True == use all columns
3767
# take only valid axis labels
3768
if data_columns is True:
3769
data_columns = list(axis_labels)
3770
elif data_columns is None:
3771
data_columns = []
3772
3773
# if min_itemsize is a dict, add the keys (exclude 'values')
3774
if isinstance(min_itemsize, dict):
3775
existing_data_columns = set(data_columns)
3776
data_columns = list(data_columns) # ensure we do not modify
3777
data_columns.extend(
3778
[
3779
k
3780
for k in min_itemsize.keys()
3781
if k != "values" and k not in existing_data_columns
3782
]
3783
)
3784
3785
# return valid columns in the order of our axis
3786
return [c for c in data_columns if c in axis_labels]
3787
3788
def _create_axes(
3789
self,
3790
axes,
3791
obj: DataFrame,
3792
validate: bool = True,
3793
nan_rep=None,
3794
data_columns=None,
3795
min_itemsize=None,
3796
):
3797
"""
3798
Create and return the axes.
3799
3800
Parameters
3801
----------
3802
axes: list or None
3803
The names or numbers of the axes to create.
3804
obj : DataFrame
3805
The object to create axes on.
3806
validate: bool, default True
3807
Whether to validate the obj against an existing object already written.
3808
nan_rep :
3809
A value to use for string column nan_rep.
3810
data_columns : List[str], True, or None, default None
3811
Specify the columns that we want to create to allow indexing on.
3812
3813
* True : Use all available columns.
3814
* None : Use no columns.
3815
* List[str] : Use the specified columns.
3816
3817
min_itemsize: Dict[str, int] or None, default None
3818
The min itemsize for a column in bytes.
3819
"""
3820
if not isinstance(obj, DataFrame):
3821
group = self.group._v_name
3822
raise TypeError(
3823
f"cannot properly create the storer for: [group->{group},"
3824
f"value->{type(obj)}]"
3825
)
3826
3827
# set the default axes if needed
3828
if axes is None:
3829
axes = [0]
3830
3831
# map axes to numbers
3832
axes = [obj._get_axis_number(a) for a in axes]
3833
3834
# do we have an existing table (if so, use its axes & data_columns)
3835
if self.infer_axes():
3836
table_exists = True
3837
axes = [a.axis for a in self.index_axes]
3838
data_columns = list(self.data_columns)
3839
nan_rep = self.nan_rep
3840
# TODO: do we always have validate=True here?
3841
else:
3842
table_exists = False
3843
3844
new_info = self.info
3845
3846
assert self.ndim == 2 # with next check, we must have len(axes) == 1
3847
        # currently support only ndim-1 axes
3848
if len(axes) != self.ndim - 1:
3849
raise ValueError(
3850
"currently only support ndim-1 indexers in an AppendableTable"
3851
)
3852
3853
# create according to the new data
3854
new_non_index_axes: list = []
3855
3856
# nan_representation
3857
if nan_rep is None:
3858
nan_rep = "nan"
3859
3860
# We construct the non-index-axis first, since that alters new_info
3861
idx = [x for x in [0, 1] if x not in axes][0]
3862
3863
a = obj.axes[idx]
3864
# we might be able to change the axes on the appending data if necessary
3865
append_axis = list(a)
3866
if table_exists:
3867
indexer = len(new_non_index_axes) # i.e. 0
3868
exist_axis = self.non_index_axes[indexer][1]
3869
if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3870
3871
# ahah! -> reindex
3872
if array_equivalent(
3873
np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3874
):
3875
append_axis = exist_axis
3876
3877
# the non_index_axes info
3878
info = new_info.setdefault(idx, {})
3879
info["names"] = list(a.names)
3880
info["type"] = type(a).__name__
3881
3882
new_non_index_axes.append((idx, append_axis))
3883
3884
# Now we can construct our new index axis
3885
idx = axes[0]
3886
a = obj.axes[idx]
3887
axis_name = obj._get_axis_name(idx)
3888
new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3889
new_index.axis = idx
3890
3891
# Because we are always 2D, there is only one new_index, so
3892
# we know it will have pos=0
3893
new_index.set_pos(0)
3894
new_index.update_info(new_info)
3895
new_index.maybe_set_size(min_itemsize) # check for column conflicts
3896
3897
new_index_axes = [new_index]
3898
j = len(new_index_axes) # i.e. 1
3899
assert j == 1
3900
3901
# reindex by our non_index_axes & compute data_columns
3902
assert len(new_non_index_axes) == 1
3903
for a in new_non_index_axes:
3904
obj = _reindex_axis(obj, a[0], a[1])
3905
3906
transposed = new_index.axis == 1
3907
3908
# figure out data_columns and get out blocks
3909
data_columns = self.validate_data_columns(
3910
data_columns, min_itemsize, new_non_index_axes
3911
)
3912
3913
frame = self.get_object(obj, transposed)._consolidate()
3914
3915
blocks, blk_items = self._get_blocks_and_items(
3916
frame, table_exists, new_non_index_axes, self.values_axes, data_columns
3917
)
3918
3919
# add my values
3920
vaxes = []
3921
for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
3922
3923
# shape of the data column are the indexable axes
3924
klass = DataCol
3925
name = None
3926
3927
# we have a data_column
3928
if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3929
klass = DataIndexableCol
3930
name = b_items[0]
3931
if not (name is None or isinstance(name, str)):
3932
# TODO: should the message here be more specifically non-str?
3933
raise ValueError("cannot have non-object label DataIndexableCol")
3934
3935
# make sure that we match up the existing columns
3936
# if we have an existing table
3937
existing_col: DataCol | None
3938
3939
if table_exists and validate:
3940
try:
3941
existing_col = self.values_axes[i]
3942
except (IndexError, KeyError) as err:
3943
raise ValueError(
3944
f"Incompatible appended table [{blocks}]"
3945
f"with existing table [{self.values_axes}]"
3946
) from err
3947
else:
3948
existing_col = None
3949
3950
new_name = name or f"values_block_{i}"
3951
data_converted = _maybe_convert_for_string_atom(
3952
new_name,
3953
blk.values,
3954
existing_col=existing_col,
3955
min_itemsize=min_itemsize,
3956
nan_rep=nan_rep,
3957
encoding=self.encoding,
3958
errors=self.errors,
3959
columns=b_items,
3960
)
3961
adj_name = _maybe_adjust_name(new_name, self.version)
3962
3963
typ = klass._get_atom(data_converted)
3964
kind = _dtype_to_kind(data_converted.dtype.name)
3965
tz = None
3966
if getattr(data_converted, "tz", None) is not None:
3967
tz = _get_tz(data_converted.tz)
3968
3969
meta = metadata = ordered = None
3970
if is_categorical_dtype(data_converted.dtype):
3971
ordered = data_converted.ordered
3972
meta = "category"
3973
metadata = np.array(data_converted.categories, copy=False).ravel()
3974
3975
data, dtype_name = _get_data_and_dtype_name(data_converted)
3976
3977
col = klass(
3978
name=adj_name,
3979
cname=new_name,
3980
values=list(b_items),
3981
typ=typ,
3982
pos=j,
3983
kind=kind,
3984
tz=tz,
3985
ordered=ordered,
3986
meta=meta,
3987
metadata=metadata,
3988
dtype=dtype_name,
3989
data=data,
3990
)
3991
col.update_info(new_info)
3992
3993
vaxes.append(col)
3994
3995
j += 1
3996
3997
dcs = [col.name for col in vaxes if col.is_data_indexable]
3998
3999
new_table = type(self)(
4000
parent=self.parent,
4001
group=self.group,
4002
encoding=self.encoding,
4003
errors=self.errors,
4004
index_axes=new_index_axes,
4005
non_index_axes=new_non_index_axes,
4006
values_axes=vaxes,
4007
data_columns=dcs,
4008
info=new_info,
4009
nan_rep=nan_rep,
4010
)
4011
if hasattr(self, "levels"):
4012
# TODO: get this into constructor, only for appropriate subclass
4013
new_table.levels = self.levels
4014
4015
new_table.validate_min_itemsize(min_itemsize)
4016
4017
if validate and table_exists:
4018
new_table.validate(self)
4019
4020
return new_table
4021
4022
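# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``_create_axes`` above is driven by the ``data_columns`` and ``min_itemsize``
# arguments users pass when writing in table format. A minimal sketch; the
# file name "axes_store.h5" is an arbitrary assumption.
def _example_data_columns_and_min_itemsize():
    import pandas as pd

    df = pd.DataFrame({"city": ["NYC", "LA"], "v": [1.0, 2.0]})
    with pd.HDFStore("axes_store.h5", mode="w") as store:
        # data_columns become individually queryable columns; min_itemsize
        # reserves string width so longer values still fit on later appends
        store.append("df", df, data_columns=["city"], min_itemsize={"city": 20})
        store.append("df", pd.DataFrame({"city": ["San Francisco"], "v": [3.0]}))
        return store.select("df", where="city == 'LA'")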
@staticmethod
4023
def _get_blocks_and_items(
4024
frame: DataFrame,
4025
table_exists: bool,
4026
new_non_index_axes,
4027
values_axes,
4028
data_columns,
4029
):
4030
# Helper to clarify non-state-altering parts of _create_axes
4031
4032
# TODO(ArrayManager) HDFStore relies on accessing the blocks
4033
if isinstance(frame._mgr, ArrayManager):
4034
frame = frame._as_manager("block")
4035
4036
def get_blk_items(mgr):
4037
return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4038
4039
mgr = frame._mgr
4040
mgr = cast(BlockManager, mgr)
4041
blocks: list[Block] = list(mgr.blocks)
4042
blk_items: list[Index] = get_blk_items(mgr)
4043
4044
if len(data_columns):
4045
axis, axis_labels = new_non_index_axes[0]
4046
new_labels = Index(axis_labels).difference(Index(data_columns))
4047
mgr = frame.reindex(new_labels, axis=axis)._mgr
4048
4049
# error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no
4050
# attribute "blocks"
4051
blocks = list(mgr.blocks) # type: ignore[union-attr]
4052
blk_items = get_blk_items(mgr)
4053
for c in data_columns:
4054
mgr = frame.reindex([c], axis=axis)._mgr
4055
# error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has
4056
# no attribute "blocks"
4057
blocks.extend(mgr.blocks) # type: ignore[union-attr]
4058
blk_items.extend(get_blk_items(mgr))
4059
4060
# reorder the blocks in the same order as the existing table if we can
4061
if table_exists:
4062
by_items = {
4063
tuple(b_items.tolist()): (b, b_items)
4064
for b, b_items in zip(blocks, blk_items)
4065
}
4066
new_blocks: list[Block] = []
4067
new_blk_items = []
4068
for ea in values_axes:
4069
items = tuple(ea.values)
4070
try:
4071
b, b_items = by_items.pop(items)
4072
new_blocks.append(b)
4073
new_blk_items.append(b_items)
4074
except (IndexError, KeyError) as err:
4075
jitems = ",".join([pprint_thing(item) for item in items])
4076
raise ValueError(
4077
f"cannot match existing table structure for [{jitems}] "
4078
"on appending data"
4079
) from err
4080
blocks = new_blocks
4081
blk_items = new_blk_items
4082
4083
return blocks, blk_items
4084
4085
def process_axes(self, obj, selection: Selection, columns=None):
4086
"""process axes filters"""
4087
# make a copy to avoid side effects
4088
if columns is not None:
4089
columns = list(columns)
4090
4091
# make sure to include levels if we have them
4092
if columns is not None and self.is_multi_index:
4093
assert isinstance(self.levels, list) # assured by is_multi_index
4094
for n in self.levels:
4095
if n not in columns:
4096
columns.insert(0, n)
4097
4098
# reorder by any non_index_axes & limit to the select columns
4099
for axis, labels in self.non_index_axes:
4100
obj = _reindex_axis(obj, axis, labels, columns)
4101
4102
# apply the selection filters (but keep in the same order)
4103
if selection.filter is not None:
4104
for field, op, filt in selection.filter.format():
4105
4106
def process_filter(field, filt):
4107
4108
for axis_name in obj._AXIS_ORDERS:
4109
axis_number = obj._get_axis_number(axis_name)
4110
axis_values = obj._get_axis(axis_name)
4111
assert axis_number is not None
4112
4113
# see if the field is the name of an axis
4114
if field == axis_name:
4115
4116
# if we have a multi-index, then need to include
4117
# the levels
4118
if self.is_multi_index:
4119
filt = filt.union(Index(self.levels))
4120
4121
takers = op(axis_values, filt)
4122
return obj.loc(axis=axis_number)[takers]
4123
4124
# this might be the name of a field IN an axis
4125
elif field in axis_values:
4126
4127
# we need to filter on this dimension
4128
values = ensure_index(getattr(obj, field).values)
4129
filt = ensure_index(filt)
4130
4131
# hack until we support reversed dim flags
4132
if isinstance(obj, DataFrame):
4133
axis_number = 1 - axis_number
4134
takers = op(values, filt)
4135
return obj.loc(axis=axis_number)[takers]
4136
4137
raise ValueError(f"cannot find the field [{field}] for filtering!")
4138
4139
obj = process_filter(field, filt)
4140
4141
return obj
4142
4143
def create_description(
4144
self,
4145
complib,
4146
complevel: int | None,
4147
fletcher32: bool,
4148
expectedrows: int | None,
4149
) -> dict[str, Any]:
4150
"""create the description of the table from the axes & values"""
4151
# use the provided expectedrows if it was passed
4152
if expectedrows is None:
4153
expectedrows = max(self.nrows_expected, 10000)
4154
4155
d = {"name": "table", "expectedrows": expectedrows}
4156
4157
# description from the axes & values
4158
d["description"] = {a.cname: a.typ for a in self.axes}
4159
4160
if complib:
4161
if complevel is None:
4162
complevel = self._complevel or 9
4163
filters = _tables().Filters(
4164
complevel=complevel,
4165
complib=complib,
4166
fletcher32=fletcher32 or self._fletcher32,
4167
)
4168
d["filters"] = filters
4169
elif self._filters is not None:
4170
d["filters"] = self._filters
4171
4172
return d
4173
4174
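# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``create_description`` above folds the store-level compression settings and
# the ``expectedrows`` hint into the PyTables table description. A minimal
# sketch; the file name "compressed.h5" is an arbitrary assumption.
def _example_compression_and_expectedrows():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(1000, 4), columns=list("abcd"))
    # complevel/complib on the store become the default Filters for new tables
    with pd.HDFStore("compressed.h5", mode="w", complevel=9, complib="blosc") as store:
        # expectedrows is a PyTables sizing hint applied when the table is created
        store.append("df", df, expectedrows=1_000_000)
        return store.select("df")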
def read_coordinates(
4175
self, where=None, start: int | None = None, stop: int | None = None
4176
):
4177
"""
4178
select coordinates (row numbers) from a table; return the
4179
coordinates object
4180
"""
4181
# validate the version
4182
self.validate_version(where)
4183
4184
# infer the data kind
4185
if not self.infer_axes():
4186
return False
4187
4188
# create the selection
4189
selection = Selection(self, where=where, start=start, stop=stop)
4190
coords = selection.select_coords()
4191
if selection.filter is not None:
4192
for field, op, filt in selection.filter.format():
4193
data = self.read_column(
4194
field, start=coords.min(), stop=coords.max() + 1
4195
)
4196
coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4197
4198
return Index(coords)
4199
4200
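# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``read_coordinates`` backs ``HDFStore.select_as_coordinates``: it returns
# the row numbers matching a query, which can then be reused for the actual
# read. A minimal sketch; "coord_store.h5" is an arbitrary file name.
def _example_select_as_coordinates():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"b": np.random.randn(100)})
    with pd.HDFStore("coord_store.h5", mode="w") as store:
        store.put("df", df, format="table", data_columns=["b"])
        coords = store.select_as_coordinates("df", "b > 0")  # Index of row numbers
        return store.select("df", where=coords)  # reuse the coordinates for the read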
def read_column(
4201
self,
4202
column: str,
4203
where=None,
4204
start: int | None = None,
4205
stop: int | None = None,
4206
):
4207
"""
4208
return a single column from the table, generally only indexables
4209
are interesting
4210
"""
4211
# validate the version
4212
self.validate_version()
4213
4214
# infer the data kind
4215
if not self.infer_axes():
4216
return False
4217
4218
if where is not None:
4219
raise TypeError("read_column does not currently accept a where clause")
4220
4221
# find the axes
4222
for a in self.axes:
4223
if column == a.name:
4224
if not a.is_data_indexable:
4225
raise ValueError(
4226
f"column [{column}] can not be extracted individually; "
4227
"it is not data indexable"
4228
)
4229
4230
# column must be an indexable or a data column
4231
c = getattr(self.table.cols, column)
4232
a.set_info(self.info)
4233
col_values = a.convert(
4234
c[start:stop],
4235
nan_rep=self.nan_rep,
4236
encoding=self.encoding,
4237
errors=self.errors,
4238
)
4239
return Series(_set_tz(col_values[1], a.tz), name=column)
4240
4241
raise KeyError(f"column [{column}] not found in the table")
4242
4243
4244
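# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``read_column`` above backs ``HDFStore.select_column``: only indexables and
# data columns can be read individually. A minimal sketch; "col_store.h5" is
# an arbitrary file name.
def _example_select_column():
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    with pd.HDFStore("col_store.h5", mode="w") as store:
        store.put("df", df, format="table", data_columns=["a"])
        idx = store.select_column("df", "index")  # the index is always indexable
        col_a = store.select_column("df", "a")  # works: "a" is a data column
        # store.select_column("df", "b") would raise: "b" is not data indexable
        return idx, col_a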
class WORMTable(Table):
4245
"""
4246
a write-once read-many table: this format DOES NOT ALLOW appending to a
4247
table. Writing is a one-time operation; the data are stored in a format
that allows for searching the data on disk
4249
"""
4250
4251
table_type = "worm"
4252
4253
def read(
4254
self,
4255
where=None,
4256
columns=None,
4257
start: int | None = None,
4258
stop: int | None = None,
4259
):
4260
"""
4261
read the indices and the indexing array, calculate offset rows and return
4262
"""
4263
raise NotImplementedError("WORMTable needs to implement read")
4264
4265
def write(self, **kwargs):
4266
"""
4267
write in a format that we can search later on (but cannot append
4268
to): write out the indices and the values using _write_array
4269
(e.g. a CArray) create an indexing table so that we can search
4270
"""
4271
raise NotImplementedError("WORMTable needs to implement write")
4272
4273
4274
class AppendableTable(Table):
4275
"""support the new appendable table formats"""
4276
4277
table_type = "appendable"
4278
4279
def write(
4280
self,
4281
obj,
4282
axes=None,
4283
append=False,
4284
complib=None,
4285
complevel=None,
4286
fletcher32=None,
4287
min_itemsize=None,
4288
chunksize=None,
4289
expectedrows=None,
4290
dropna=False,
4291
nan_rep=None,
4292
data_columns=None,
4293
track_times=True,
4294
):
4295
if not append and self.is_exists:
4296
self._handle.remove_node(self.group, "table")
4297
4298
# create the axes
4299
table = self._create_axes(
4300
axes=axes,
4301
obj=obj,
4302
validate=append,
4303
min_itemsize=min_itemsize,
4304
nan_rep=nan_rep,
4305
data_columns=data_columns,
4306
)
4307
4308
for a in table.axes:
4309
a.validate_names()
4310
4311
if not table.is_exists:
4312
4313
# create the table
4314
options = table.create_description(
4315
complib=complib,
4316
complevel=complevel,
4317
fletcher32=fletcher32,
4318
expectedrows=expectedrows,
4319
)
4320
4321
# set the table attributes
4322
table.set_attrs()
4323
4324
options["track_times"] = track_times
4325
4326
# create the table
4327
table._handle.create_table(table.group, **options)
4328
4329
# update my info
4330
table.attrs.info = table.info
4331
4332
# validate the axes and set the kinds
4333
for a in table.axes:
4334
a.validate_and_set(table, append)
4335
4336
# add the rows
4337
table.write_data(chunksize, dropna=dropna)
4338
4339
def write_data(self, chunksize: int | None, dropna: bool = False):
4340
"""
4341
we form the data into a 2-d structure including indexes, values, and mask, and write it chunk-by-chunk
4342
"""
4343
names = self.dtype.names
4344
nrows = self.nrows_expected
4345
4346
# if dropna==True, then drop ALL nan rows
4347
masks = []
4348
if dropna:
4349
for a in self.values_axes:
4350
# figure the mask: only do if we can successfully process this
4351
# column, otherwise ignore the mask
4352
mask = isna(a.data).all(axis=0)
4353
if isinstance(mask, np.ndarray):
4354
masks.append(mask.astype("u1", copy=False))
4355
4356
# consolidate masks
4357
if len(masks):
4358
mask = masks[0]
4359
for m in masks[1:]:
4360
mask = mask & m
4361
mask = mask.ravel()
4362
else:
4363
mask = None
4364
4365
# broadcast the indexes if needed
4366
indexes = [a.cvalues for a in self.index_axes]
4367
nindexes = len(indexes)
4368
assert nindexes == 1, nindexes  # ensures we don't need to broadcast
4369
4370
# transpose the values so first dimension is last
4371
# reshape the values if needed
4372
values = [a.take_data() for a in self.values_axes]
4373
values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4374
bvalues = []
4375
for i, v in enumerate(values):
4376
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4377
bvalues.append(values[i].reshape(new_shape))
4378
4379
# write the chunks
4380
if chunksize is None:
4381
chunksize = 100000
4382
4383
rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4384
chunks = nrows // chunksize + 1
4385
for i in range(chunks):
4386
start_i = i * chunksize
4387
end_i = min((i + 1) * chunksize, nrows)
4388
if start_i >= end_i:
4389
break
4390
4391
self.write_data_chunk(
4392
rows,
4393
indexes=[a[start_i:end_i] for a in indexes],
4394
mask=mask[start_i:end_i] if mask is not None else None,
4395
values=[v[start_i:end_i] for v in bvalues],
4396
)
4397
4398
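# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``write_data`` above writes rows in chunks of ``chunksize`` and can skip
# all-NaN rows when ``dropna`` is set. A minimal sketch; "chunk_store.h5" is
# an arbitrary file name.
def _example_chunked_append():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(1_000_000.0)})
    with pd.HDFStore("chunk_store.h5", mode="w") as store:
        # rows are appended 100_000 at a time; all-NaN rows would be dropped
        store.append("df", df, chunksize=100_000, dropna=True)
        return store.get_storer("df").nrows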
def write_data_chunk(
4399
self,
4400
rows: np.ndarray,
4401
indexes: list[np.ndarray],
4402
mask: np.ndarray | None,
4403
values: list[np.ndarray],
4404
):
4405
"""
4406
Parameters
4407
----------
4408
rows : an empty memory space where we are putting the chunk
4409
indexes : an array of the indexes
4410
mask : an array of the masks
4411
values : an array of the values
4412
"""
4413
# 0 len
4414
for v in values:
4415
if not np.prod(v.shape):
4416
return
4417
4418
nrows = indexes[0].shape[0]
4419
if nrows != len(rows):
4420
rows = np.empty(nrows, dtype=self.dtype)
4421
names = self.dtype.names
4422
nindexes = len(indexes)
4423
4424
# indexes
4425
for i, idx in enumerate(indexes):
4426
rows[names[i]] = idx
4427
4428
# values
4429
for i, v in enumerate(values):
4430
rows[names[i + nindexes]] = v
4431
4432
# mask
4433
if mask is not None:
4434
m = ~mask.ravel().astype(bool, copy=False)
4435
if not m.all():
4436
rows = rows[m]
4437
4438
if len(rows):
4439
self.table.append(rows)
4440
self.table.flush()
4441
4442
def delete(self, where=None, start: int | None = None, stop: int | None = None):
4443
4444
# delete all rows (and return the nrows)
4445
if where is None or not len(where):
4446
if start is None and stop is None:
4447
nrows = self.nrows
4448
self._handle.remove_node(self.group, recursive=True)
4449
else:
4450
# pytables<3.0 would remove a single row with stop=None
4451
if stop is None:
4452
stop = self.nrows
4453
nrows = self.table.remove_rows(start=start, stop=stop)
4454
self.table.flush()
4455
return nrows
4456
4457
# infer the data kind
4458
if not self.infer_axes():
4459
return None
4460
4461
# create the selection
4462
table = self.table
4463
selection = Selection(self, where, start=start, stop=stop)
4464
values = selection.select_coords()
4465
4466
# delete the rows in reverse order
4467
sorted_series = Series(values).sort_values()
4468
ln = len(sorted_series)
4469
4470
if ln:
4471
4472
# construct groups of consecutive rows
4473
diff = sorted_series.diff()
4474
groups = list(diff[diff > 1].index)
4475
4476
# 1 group
4477
if not len(groups):
4478
groups = [0]
4479
4480
# final element
4481
if groups[-1] != ln:
4482
groups.append(ln)
4483
4484
# initial element
4485
if groups[0] != 0:
4486
groups.insert(0, 0)
4487
4488
# we must remove in reverse order!
4489
pg = groups.pop()
4490
for g in reversed(groups):
4491
rows = sorted_series.take(range(g, pg))
4492
table.remove_rows(
4493
start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4494
)
4495
pg = g
4496
4497
self.table.flush()
4498
4499
# return the number of rows removed
4500
return ln
4501
4502
4503
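# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``delete`` above backs ``HDFStore.remove`` with a where clause: matching
# rows are grouped into consecutive runs and removed in reverse order.
# A minimal sketch; "rm_store.h5" is an arbitrary file name.
def _example_remove_rows():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": np.arange(10)})
    with pd.HDFStore("rm_store.h5", mode="w") as store:
        store.put("df", df, format="table")
        n_removed = store.remove("df", where="index > 5")  # number of rows removed
        return n_removed, store.select("df")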
class AppendableFrameTable(AppendableTable):
4504
"""support the new appendable table formats"""
4505
4506
pandas_kind = "frame_table"
4507
table_type = "appendable_frame"
4508
ndim = 2
4509
obj_type: type[DataFrame | Series] = DataFrame
4510
4511
@property
4512
def is_transposed(self) -> bool:
4513
return self.index_axes[0].axis == 1
4514
4515
@classmethod
4516
def get_object(cls, obj, transposed: bool):
4517
"""these are written transposed"""
4518
if transposed:
4519
obj = obj.T
4520
return obj
4521
4522
def read(
4523
self,
4524
where=None,
4525
columns=None,
4526
start: int | None = None,
4527
stop: int | None = None,
4528
):
4529
4530
# validate the version
4531
self.validate_version(where)
4532
4533
# infer the data kind
4534
if not self.infer_axes():
4535
return None
4536
4537
result = self._read_axes(where=where, start=start, stop=stop)
4538
4539
info = (
4540
self.info.get(self.non_index_axes[0][0], {})
4541
if len(self.non_index_axes)
4542
else {}
4543
)
4544
4545
inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4546
assert len(inds) == 1
4547
ind = inds[0]
4548
4549
index = result[ind][0]
4550
4551
frames = []
4552
for i, a in enumerate(self.axes):
4553
if a not in self.values_axes:
4554
continue
4555
index_vals, cvalues = result[i]
4556
4557
# we could have a multi-index constructor here
4558
# ensure_index doesn't recognize our list-of-tuples here
4559
if info.get("type") != "MultiIndex":
4560
cols = Index(index_vals)
4561
else:
4562
cols = MultiIndex.from_tuples(index_vals)
4563
4564
names = info.get("names")
4565
if names is not None:
4566
cols.set_names(names, inplace=True)
4567
4568
if self.is_transposed:
4569
values = cvalues
4570
index_ = cols
4571
cols_ = Index(index, name=getattr(index, "name", None))
4572
else:
4573
values = cvalues.T
4574
index_ = Index(index, name=getattr(index, "name", None))
4575
cols_ = cols
4576
4577
# if we have a DataIndexableCol, its shape will only be 1 dim
4578
if values.ndim == 1 and isinstance(values, np.ndarray):
4579
values = values.reshape((1, values.shape[0]))
4580
4581
if isinstance(values, np.ndarray):
4582
df = DataFrame(values.T, columns=cols_, index=index_)
4583
elif isinstance(values, Index):
4584
df = DataFrame(values, columns=cols_, index=index_)
4585
else:
4586
# Categorical
4587
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4588
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4589
frames.append(df)
4590
4591
if len(frames) == 1:
4592
df = frames[0]
4593
else:
4594
df = concat(frames, axis=1)
4595
4596
selection = Selection(self, where=where, start=start, stop=stop)
4597
# apply the selection filters & axis orderings
4598
df = self.process_axes(df, selection=selection, columns=columns)
4599
4600
return df
4601
4602
4603
class AppendableSeriesTable(AppendableFrameTable):
4604
"""support the new appendable table formats"""
4605
4606
pandas_kind = "series_table"
4607
table_type = "appendable_series"
4608
ndim = 2
4609
obj_type = Series
4610
4611
@property
4612
def is_transposed(self) -> bool:
4613
return False
4614
4615
@classmethod
4616
def get_object(cls, obj, transposed: bool):
4617
return obj
4618
4619
def write(self, obj, data_columns=None, **kwargs):
4620
"""we are going to write this as a frame table"""
4621
if not isinstance(obj, DataFrame):
4622
name = obj.name or "values"
4623
obj = obj.to_frame(name)
4624
return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4625
4626
def read(
4627
self,
4628
where=None,
4629
columns=None,
4630
start: int | None = None,
4631
stop: int | None = None,
4632
) -> Series:
4633
4634
is_multi_index = self.is_multi_index
4635
if columns is not None and is_multi_index:
4636
assert isinstance(self.levels, list) # needed for mypy
4637
for n in self.levels:
4638
if n not in columns:
4639
columns.insert(0, n)
4640
s = super().read(where=where, columns=columns, start=start, stop=stop)
4641
if is_multi_index:
4642
s.set_index(self.levels, inplace=True)
4643
4644
s = s.iloc[:, 0]
4645
4646
# remove the default name
4647
if s.name == "values":
4648
s.name = None
4649
return s
4650
4651
4652
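# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``AppendableSeriesTable`` above stores a Series as a one-column frame named
# "values" and strips that default name on read. A minimal sketch;
# "series_store.h5" is an arbitrary file name.
def _example_series_table():
    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0])  # unnamed, stored under the column "values"
    s.to_hdf("series_store.h5", key="s", format="table")
    back = pd.read_hdf("series_store.h5", "s")
    assert back.name is None  # the default "values" name is removed on read
    return back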
class AppendableMultiSeriesTable(AppendableSeriesTable):
4653
"""support the new appendable table formats"""
4654
4655
pandas_kind = "series_table"
4656
table_type = "appendable_multiseries"
4657
4658
def write(self, obj, **kwargs):
4659
"""we are going to write this as a frame table"""
4660
name = obj.name or "values"
4661
newobj, self.levels = self.validate_multiindex(obj)
4662
assert isinstance(self.levels, list) # for mypy
4663
cols = list(self.levels)
4664
cols.append(name)
4665
newobj.columns = Index(cols)
4666
return super().write(obj=newobj, **kwargs)
4667
4668
4669
class GenericTable(AppendableFrameTable):
4670
"""a table that read/writes the generic pytables table format"""
4671
4672
pandas_kind = "frame_table"
4673
table_type = "generic_table"
4674
ndim = 2
4675
obj_type = DataFrame
4676
levels: list[Hashable]
4677
4678
@property
4679
def pandas_type(self) -> str:
4680
return self.pandas_kind
4681
4682
@property
4683
def storable(self):
4684
return getattr(self.group, "table", None) or self.group
4685
4686
def get_attrs(self):
4687
"""retrieve our attributes"""
4688
self.non_index_axes = []
4689
self.nan_rep = None
4690
self.levels = []
4691
4692
self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4693
self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4694
self.data_columns = [a.name for a in self.values_axes]
4695
4696
@cache_readonly
4697
def indexables(self):
4698
"""create the indexables from the table description"""
4699
d = self.description
4700
4701
# TODO: can we get a typ for this? AFAICT it is the only place
4702
# where we aren't passing one
4703
# the index columns is just a simple index
4704
md = self.read_metadata("index")
4705
meta = "category" if md is not None else None
4706
index_col = GenericIndexCol(
4707
name="index", axis=0, table=self.table, meta=meta, metadata=md
4708
)
4709
4710
_indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4711
4712
for i, n in enumerate(d._v_names):
4713
assert isinstance(n, str)
4714
4715
atom = getattr(d, n)
4716
md = self.read_metadata(n)
4717
meta = "category" if md is not None else None
4718
dc = GenericDataIndexableCol(
4719
name=n,
4720
pos=i,
4721
values=[n],
4722
typ=atom,
4723
table=self.table,
4724
meta=meta,
4725
metadata=md,
4726
)
4727
_indexables.append(dc)
4728
4729
return _indexables
4730
4731
def write(self, **kwargs):
4732
raise NotImplementedError("cannot write on a generic table")
4733
4734
4735
class AppendableMultiFrameTable(AppendableFrameTable):
4736
"""a frame with a multi-index"""
4737
4738
table_type = "appendable_multiframe"
4739
obj_type = DataFrame
4740
ndim = 2
4741
_re_levels = re.compile(r"^level_\d+$")
4742
4743
@property
4744
def table_type_short(self) -> str:
4745
return "appendable_multi"
4746
4747
def write(self, obj, data_columns=None, **kwargs):
4748
if data_columns is None:
4749
data_columns = []
4750
elif data_columns is True:
4751
data_columns = obj.columns.tolist()
4752
obj, self.levels = self.validate_multiindex(obj)
4753
assert isinstance(self.levels, list) # for mypy
4754
for n in self.levels:
4755
if n not in data_columns:
4756
data_columns.insert(0, n)
4757
return super().write(obj=obj, data_columns=data_columns, **kwargs)
4758
4759
def read(
4760
self,
4761
where=None,
4762
columns=None,
4763
start: int | None = None,
4764
stop: int | None = None,
4765
):
4766
4767
df = super().read(where=where, columns=columns, start=start, stop=stop)
4768
df = df.set_index(self.levels)
4769
4770
# remove names for 'level_%d'
4771
df.index = df.index.set_names(
4772
[None if self._re_levels.search(name) else name for name in df.index.names]
4773
)
4774
4775
return df
4776
4777
4778
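# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``AppendableMultiFrameTable`` above writes the MultiIndex levels as ordinary
# data columns and restores them as the index on read. A minimal sketch;
# "mi_store.h5" is an arbitrary file name.
def _example_multiindex_table():
    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["key", "num"])
    df = pd.DataFrame({"val": np.arange(4.0)}, index=idx)
    df.to_hdf("mi_store.h5", key="mi", format="table")
    back = pd.read_hdf("mi_store.h5", "mi")
    assert isinstance(back.index, pd.MultiIndex)
    return back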
def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
4779
ax = obj._get_axis(axis)
4780
labels = ensure_index(labels)
4781
4782
# try not to reindex even if other is provided
4783
# if it equals our current index
4784
if other is not None:
4785
other = ensure_index(other)
4786
if (other is None or labels.equals(other)) and labels.equals(ax):
4787
return obj
4788
4789
labels = ensure_index(labels.unique())
4790
if other is not None:
4791
labels = ensure_index(other.unique()).intersection(labels, sort=False)
4792
if not labels.equals(ax):
4793
slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4794
slicer[axis] = labels
4795
obj = obj.loc[tuple(slicer)]
4796
return obj
4797
4798
4799
# tz to/from coercion
4800
4801
4802
def _get_tz(tz: tzinfo) -> str | tzinfo:
4803
"""for a tz-aware type, return an encoded zone"""
4804
zone = timezones.get_timezone(tz)
4805
return zone
4806
4807
4808
def _set_tz(
4809
values: np.ndarray | Index,
4810
tz: str | tzinfo | None,
4811
coerce: bool = False,
4812
) -> np.ndarray | DatetimeIndex:
4813
"""
4814
coerce the values to a DatetimeIndex if tz is set
4815
preserve the input shape if possible
4816
4817
Parameters
4818
----------
4819
values : ndarray or Index
4820
tz : str or tzinfo
4821
coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4822
"""
4823
if isinstance(values, DatetimeIndex):
4824
# If values is tzaware, the tz gets dropped in the values.ravel()
4825
# call below (which returns an ndarray). So we are only non-lossy
4826
# if `tz` matches `values.tz`.
4827
assert values.tz is None or values.tz == tz
4828
4829
if tz is not None:
4830
if isinstance(values, DatetimeIndex):
4831
name = values.name
4832
values = values.asi8
4833
else:
4834
name = None
4835
values = values.ravel()
4836
4837
tz = _ensure_decoded(tz)
4838
values = DatetimeIndex(values, name=name)
4839
values = values.tz_localize("UTC").tz_convert(tz)
4840
elif coerce:
4841
values = np.asarray(values, dtype="M8[ns]")
4842
4843
# error: Incompatible return value type (got "Union[ndarray, Index]",
4844
# expected "Union[ndarray, DatetimeIndex]")
4845
return values # type: ignore[return-value]
4846
4847
4848
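# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``_get_tz`` / ``_set_tz`` above store tz-aware datetimes as UTC integers plus
# an encoded zone, and re-localize them on read. A minimal sketch;
# "tz_store.h5" is an arbitrary file name.
def _example_timezone_roundtrip():
    import pandas as pd

    df = pd.DataFrame(
        {"ts": pd.date_range("2021-01-01", periods=3, tz="US/Eastern"), "v": [1, 2, 3]}
    )
    df.to_hdf("tz_store.h5", key="df", format="table")
    back = pd.read_hdf("tz_store.h5", "df")
    assert str(back["ts"].dtype) == "datetime64[ns, US/Eastern]"
    return back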
def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4849
assert isinstance(name, str)
4850
4851
index_name = index.name
4852
# error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4853
# expected "Union[ExtensionArray, ndarray]"
4854
converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4855
kind = _dtype_to_kind(dtype_name)
4856
atom = DataIndexableCol._get_atom(converted)
4857
4858
if isinstance(index, Int64Index) or needs_i8_conversion(index.dtype):
4859
# Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4860
# in which case "kind" is "integer", "integer", "datetime64",
4861
# "timedelta64", and "integer", respectively.
4862
return IndexCol(
4863
name,
4864
values=converted,
4865
kind=kind,
4866
typ=atom,
4867
freq=getattr(index, "freq", None),
4868
tz=getattr(index, "tz", None),
4869
index_name=index_name,
4870
)
4871
4872
if isinstance(index, MultiIndex):
4873
raise TypeError("MultiIndex not supported here!")
4874
4875
inferred_type = lib.infer_dtype(index, skipna=False)
4876
# we won't get inferred_type of "datetime64" or "timedelta64" as these
4877
# would go through the DatetimeIndex/TimedeltaIndex paths above
4878
4879
values = np.asarray(index)
4880
4881
if inferred_type == "date":
4882
converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4883
return IndexCol(
4884
name, converted, "date", _tables().Time32Col(), index_name=index_name
4885
)
4886
elif inferred_type == "string":
4887
4888
converted = _convert_string_array(values, encoding, errors)
4889
itemsize = converted.dtype.itemsize
4890
return IndexCol(
4891
name,
4892
converted,
4893
"string",
4894
_tables().StringCol(itemsize),
4895
index_name=index_name,
4896
)
4897
4898
elif inferred_type in ["integer", "floating"]:
4899
return IndexCol(
4900
name, values=converted, kind=kind, typ=atom, index_name=index_name
4901
)
4902
else:
4903
assert isinstance(converted, np.ndarray) and converted.dtype == object
4904
assert kind == "object", kind
4905
atom = _tables().ObjectAtom()
4906
return IndexCol(name, converted, kind, atom, index_name=index_name)
4907
4908
4909
def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
4910
index: Index | np.ndarray
4911
4912
if kind == "datetime64":
4913
index = DatetimeIndex(data)
4914
elif kind == "timedelta64":
4915
index = TimedeltaIndex(data)
4916
elif kind == "date":
4917
try:
4918
index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4919
except ValueError:
4920
index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4921
elif kind in ("integer", "float"):
4922
index = np.asarray(data)
4923
elif kind == "string":
4924
index = _unconvert_string_array(
4925
data, nan_rep=None, encoding=encoding, errors=errors
4926
)
4927
elif kind == "object":
4928
index = np.asarray(data[0])
4929
else: # pragma: no cover
4930
raise ValueError(f"unrecognized index type {kind}")
4931
return index
4932
4933
4934
def _maybe_convert_for_string_atom(
4935
name: str,
4936
bvalues: ArrayLike,
4937
existing_col,
4938
min_itemsize,
4939
nan_rep,
4940
encoding,
4941
errors,
4942
columns: list[str],
4943
):
4944
4945
if bvalues.dtype != object:
4946
return bvalues
4947
4948
bvalues = cast(np.ndarray, bvalues)
4949
4950
dtype_name = bvalues.dtype.name
4951
inferred_type = lib.infer_dtype(bvalues, skipna=False)
4952
4953
if inferred_type == "date":
4954
raise TypeError("[date] is not implemented as a table column")
4955
elif inferred_type == "datetime":
4956
# after GH#8260
4957
# this only would be hit for a multi-timezone dtype which is an error
4958
raise TypeError(
4959
"too many timezones in this block, create separate data columns"
4960
)
4961
4962
elif not (inferred_type == "string" or dtype_name == "object"):
4963
return bvalues
4964
4965
mask = isna(bvalues)
4966
data = bvalues.copy()
4967
data[mask] = nan_rep
4968
4969
# see if we have a valid string type
4970
inferred_type = lib.infer_dtype(data, skipna=False)
4971
if inferred_type != "string":
4972
4973
# we cannot serialize this data, so report an exception on a column
4974
# by column basis
4975
4976
# expected behaviour:
4977
# search block for a non-string object column by column
4978
for i in range(data.shape[0]):
4979
col = data[i]
4980
inferred_type = lib.infer_dtype(col, skipna=False)
4981
if inferred_type != "string":
4982
error_column_label = columns[i] if len(columns) > i else f"No.{i}"
4983
raise TypeError(
4984
f"Cannot serialize the column [{error_column_label}]\n"
4985
f"because its data contents are not [string] but "
4986
f"[{inferred_type}] object dtype"
4987
)
4988
4989
# itemsize is the maximum length of a string (along any dimension)
4990
4991
data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
4992
itemsize = data_converted.itemsize
4993
4994
# specified min_itemsize?
4995
if isinstance(min_itemsize, dict):
4996
min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
4997
itemsize = max(min_itemsize or 0, itemsize)
4998
4999
# check for column in the values conflicts
5000
if existing_col is not None:
5001
eci = existing_col.validate_col(itemsize)
5002
if eci is not None and eci > itemsize:
5003
itemsize = eci
5004
5005
data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5006
return data_converted
5007
5008
5009
def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5010
"""
5011
Take a string-like that is object dtype and coerce to a fixed size string type.
5012
5013
Parameters
5014
----------
5015
data : np.ndarray[object]
5016
encoding : str
5017
errors : str
5018
Handler for encoding errors.
5019
5020
Returns
5021
-------
5022
np.ndarray[fixed-length-string]
5023
"""
5024
# encode if needed
5025
if len(data):
5026
data = (
5027
Series(data.ravel())
5028
.str.encode(encoding, errors)
5029
._values.reshape(data.shape)
5030
)
5031
5032
# create the sized dtype
5033
ensured = ensure_object(data.ravel())
5034
itemsize = max(1, libwriters.max_len_string_array(ensured))
5035
5036
data = np.asarray(data, dtype=f"S{itemsize}")
5037
return data
5038
5039
5040
def _unconvert_string_array(
5041
data: np.ndarray, nan_rep, encoding: str, errors: str
5042
) -> np.ndarray:
5043
"""
5044
Inverse of _convert_string_array.
5045
5046
Parameters
5047
----------
5048
data : np.ndarray[fixed-length-string]
5049
nan_rep : the storage repr of NaN
5050
encoding : str
5051
errors : str
5052
Handler for encoding errors.
5053
5054
Returns
5055
-------
5056
np.ndarray[object]
5057
Decoded data.
5058
"""
5059
shape = data.shape
5060
data = np.asarray(data.ravel(), dtype=object)
5061
5062
if len(data):
5063
5064
itemsize = libwriters.max_len_string_array(ensure_object(data))
5065
dtype = f"U{itemsize}"
5066
5067
if isinstance(data[0], bytes):
5068
data = Series(data).str.decode(encoding, errors=errors)._values
5069
else:
5070
data = data.astype(dtype, copy=False).astype(object, copy=False)
5071
5072
if nan_rep is None:
5073
nan_rep = "nan"
5074
5075
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5076
return data.reshape(shape)
5077
5078
5079
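# --- Added commentary: illustrative round trip, not part of pandas -------
# ``_convert_string_array`` / ``_unconvert_string_array`` above are private
# helpers (subject to change); this sketch only shows the fixed-width encode /
# decode round trip they implement.
def _example_string_array_roundtrip():
    import numpy as np

    arr = np.array(["foo", "longer string"], dtype=object)
    packed = _convert_string_array(arr, encoding="UTF-8", errors="strict")
    assert packed.dtype == np.dtype("S13")  # sized to the longest encoded value
    unpacked = _unconvert_string_array(
        packed, nan_rep=None, encoding="UTF-8", errors="strict"
    )
    assert list(unpacked) == ["foo", "longer string"]
    return unpacked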
def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5080
assert isinstance(val_kind, str), type(val_kind)
5081
if _need_convert(val_kind):
5082
conv = _get_converter(val_kind, encoding, errors)
5083
values = conv(values)
5084
return values
5085
5086
5087
def _get_converter(kind: str, encoding: str, errors: str):
5088
if kind == "datetime64":
5089
return lambda x: np.asarray(x, dtype="M8[ns]")
5090
elif kind == "string":
5091
return lambda x: _unconvert_string_array(
5092
x, nan_rep=None, encoding=encoding, errors=errors
5093
)
5094
else: # pragma: no cover
5095
raise ValueError(f"invalid kind {kind}")
5096
5097
5098
def _need_convert(kind: str) -> bool:
5099
if kind in ("datetime64", "string"):
5100
return True
5101
return False
5102
5103
5104
def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5105
"""
5106
Prior to 0.10.1, values blocks were named like values_0 rather than
values_block_0; adjust the given name if necessary.
5108
5109
Parameters
5110
----------
5111
name : str
5112
version : Tuple[int, int, int]
5113
5114
Returns
5115
-------
5116
str
5117
"""
5118
if isinstance(version, str) or len(version) < 3:
5119
raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5120
5121
if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5122
m = re.search(r"values_block_(\d+)", name)
5123
if m:
5124
grp = m.groups()[0]
5125
name = f"values_{grp}"
5126
return name
5127
5128
5129
def _dtype_to_kind(dtype_str: str) -> str:
5130
"""
5131
Find the "kind" string describing the given dtype name.
5132
"""
5133
dtype_str = _ensure_decoded(dtype_str)
5134
5135
if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
5136
kind = "string"
5137
elif dtype_str.startswith("float"):
5138
kind = "float"
5139
elif dtype_str.startswith("complex"):
5140
kind = "complex"
5141
elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
5142
kind = "integer"
5143
elif dtype_str.startswith("datetime64"):
5144
kind = "datetime64"
5145
elif dtype_str.startswith("timedelta"):
5146
kind = "timedelta64"
5147
elif dtype_str.startswith("bool"):
5148
kind = "bool"
5149
elif dtype_str.startswith("category"):
5150
kind = "category"
5151
elif dtype_str.startswith("period"):
5152
# We store the `freq` attr so we can restore from integers
5153
kind = "integer"
5154
elif dtype_str == "object":
5155
kind = "object"
5156
else:
5157
raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5158
5159
return kind
5160
5161
5162
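# --- Added commentary: illustrative mapping, not part of pandas ----------
# A few sample results of ``_dtype_to_kind`` above, as a quick sanity check.
def _example_dtype_to_kind():
    assert _dtype_to_kind("int64") == "integer"
    assert _dtype_to_kind("float64") == "float"
    assert _dtype_to_kind("datetime64[ns]") == "datetime64"
    assert _dtype_to_kind("object") == "object"
    assert _dtype_to_kind("category") == "category"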
def _get_data_and_dtype_name(data: ArrayLike):
5163
"""
5164
Convert the passed data into a storable form and a dtype string.
5165
"""
5166
if isinstance(data, Categorical):
5167
data = data.codes
5168
5169
# For datetime64tz we need to drop the TZ in tests TODO: why?
5170
dtype_name = data.dtype.name.split("[")[0]
5171
5172
if data.dtype.kind in ["m", "M"]:
5173
data = np.asarray(data.view("i8"))
5174
# TODO: we used to reshape for the dt64tz case, but no longer
5175
# doing that doesn't seem to break anything. why?
5176
5177
elif isinstance(data, PeriodIndex):
5178
data = data.asi8
5179
5180
data = np.asarray(data)
5181
return data, dtype_name
5182
5183
5184
class Selection:
5185
"""
5186
Carries out a selection operation on a tables.Table object.
5187
5188
Parameters
5189
----------
5190
table : a Table object
5191
where : list of Terms (or convertible to)
5192
start, stop: indices to start and/or stop selection
5193
5194
"""
5195
5196
def __init__(
5197
self,
5198
table: Table,
5199
where=None,
5200
start: int | None = None,
5201
stop: int | None = None,
5202
):
5203
self.table = table
5204
self.where = where
5205
self.start = start
5206
self.stop = stop
5207
self.condition = None
5208
self.filter = None
5209
self.terms = None
5210
self.coordinates = None
5211
5212
if is_list_like(where):
5213
5214
# see if we have a passed coordinate like
5215
with suppress(ValueError):
5216
inferred = lib.infer_dtype(where, skipna=False)
5217
if inferred == "integer" or inferred == "boolean":
5218
where = np.asarray(where)
5219
if where.dtype == np.bool_:
5220
start, stop = self.start, self.stop
5221
if start is None:
5222
start = 0
5223
if stop is None:
5224
stop = self.table.nrows
5225
self.coordinates = np.arange(start, stop)[where]
5226
elif issubclass(where.dtype.type, np.integer):
5227
if (self.start is not None and (where < self.start).any()) or (
5228
self.stop is not None and (where >= self.stop).any()
5229
):
5230
raise ValueError(
5231
"where must have index locations >= start and < stop"
5232
)
5233
self.coordinates = where
5234
5235
if self.coordinates is None:
5236
5237
self.terms = self.generate(where)
5238
5239
# create the numexpr & the filter
5240
if self.terms is not None:
5241
self.condition, self.filter = self.terms.evaluate()
5242
5243
def generate(self, where):
5244
"""where can be a : dict,list,tuple,string"""
5245
if where is None:
5246
return None
5247
5248
q = self.table.queryables()
5249
try:
5250
return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5251
except NameError as err:
5252
# raise a nice message, suggesting that the user should use
5253
# data_columns
5254
qkeys = ",".join(q.keys())
5255
msg = dedent(
5256
f"""\
5257
The passed where expression: {where}
5258
contains an invalid variable reference
5259
all of the variable references must be a reference to
5260
an axis (e.g. 'index' or 'columns'), or a data_column
5261
The currently defined references are: {qkeys}
5262
"""
5263
)
5264
raise ValueError(msg) from err
5265
5266
def select(self):
5267
"""
5268
generate the selection
5269
"""
5270
if self.condition is not None:
5271
return self.table.table.read_where(
5272
self.condition.format(), start=self.start, stop=self.stop
5273
)
5274
elif self.coordinates is not None:
5275
return self.table.table.read_coordinates(self.coordinates)
5276
return self.table.table.read(start=self.start, stop=self.stop)
5277
5278
def select_coords(self):
5279
"""
5280
generate the selection
5281
"""
5282
start, stop = self.start, self.stop
5283
nrows = self.table.nrows
5284
if start is None:
5285
start = 0
5286
elif start < 0:
5287
start += nrows
5288
if stop is None:
5289
stop = nrows
5290
elif stop < 0:
5291
stop += nrows
5292
5293
if self.condition is not None:
5294
return self.table.table.get_where_list(
5295
self.condition.format(), start=start, stop=stop, sort=True
5296
)
5297
elif self.coordinates is not None:
5298
return self.coordinates
5299
5300
return np.arange(start, stop)
5301
5302
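# --- Added commentary: illustrative usage sketch, not part of pandas -----
# ``Selection`` above turns user-supplied ``where`` values into either a
# numexpr condition, a filter, or explicit coordinates. A minimal sketch of
# the query forms it supports; "sel_store.h5" is an arbitrary file name.
def _example_where_queries():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(10.0), "b": np.arange(10.0) * 2})
    with pd.HDFStore("sel_store.h5", mode="w") as store:
        store.put("df", df, format="table", data_columns=True)
        r1 = store.select("df", where="index >= 3 & a < 8")  # index + data column terms
        r2 = store.select("df", where="columns = ['a']")  # column selection becomes a filter
        r3 = store.select("df", start=2, stop=5)  # positional start/stop slice
        return r1, r2, r3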