//! Row format as defined in `arrow-rs`.1//! This currently partially implements that format only for needed types.2//! For completeness sake the format as defined by `arrow-rs` is as followed:3//! Converts [`ArrayRef`] columns into a [row-oriented](self) format.4//!5//! ## Overview6//!7//! The row format is a variable length byte sequence created by8//! concatenating the encoded form of each column. The encoding for9//! each column depends on its datatype (and sort options).10//!11//! The encoding is carefully designed in such a way that escaping is12//! unnecessary: it is never ambiguous as to whether a byte is part of13//! a sentinel (e.g. null) or a value.14//!15//! ## Unsigned Integer Encoding16//!17//! A null integer is encoded as a `0_u8`, followed by a zero-ed number of bytes corresponding18//! to the integer's length.19//!20//! A valid integer is encoded as `1_u8`, followed by the big-endian representation of the21//! integer.22//!23//! ```text24//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐25//! 3 │03│00│00│00│ │01│00│00│00│03│26//! └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘27//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐28//! 258 │02│01│00│00│ │01│00│00│01│02│29//! └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘30//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐31//! 23423 │7F│5B│00│00│ │01│00│00│5B│7F│32//! └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘33//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐34//! NULL │??│??│??│??│ │00│00│00│00│00│35//! └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘36//!37//! 32-bit (4 bytes) Row Format38//! Value Little Endian39//! ```40//!41//! ## Signed Integer Encoding42//!43//! Signed integers have their most significant sign bit flipped, and are then encoded in the44//! same manner as an unsigned integer.45//!46//! ```text47//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐48//! 5 │05│00│00│00│ │05│00│00│80│ │01│80│00│00│05│49//! └──┴──┴──┴──┘ └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘50//! ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐51//! -5 │FB│FF│FF│FF│ │FB│FF│FF│7F│ │01│7F│FF│FF│FB│52//! └──┴──┴──┴──┘ └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘53//!54//! Value 32-bit (4 bytes) High bit flipped Row Format55//! Little Endian56//! ```57//!58//! ## Float Encoding59//!60//! Floats are converted from IEEE 754 representation to a signed integer representation61//! by flipping all bar the sign bit if they are negative after normalizing nans62//! and signed zeros to a canonical representation.63//!64//! They are then encoded in the same manner as a signed integer.65//!66//! ## Fixed Length Bytes Encoding67//!68//! Fixed length bytes are encoded in the same fashion as primitive types above.69//!70//! For a fixed length array of length `n`:71//!72//! A null is encoded as `0_u8` null sentinel followed by `n` `0_u8` bytes73//!74//! A valid value is encoded as `1_u8` followed by the value bytes75//!76//! ## Variable Length Bytes (including Strings) Encoding77//!78//! A null is encoded as a `0_u8`.79//!80//! An empty byte array is encoded as `1_u8`.81//!82//! A non-null, non-empty byte array is encoded as `2_u8` followed by the byte array83//! encoded using a block based scheme described below.84//!85//! The byte array is broken up into 32-byte blocks, each block is written in turn86//! to the output, followed by `0xFF_u8`. The final block is padded to 32-bytes87//! with `0_u8` and written to the output, followed by the un-padded length in bytes88//! of this final block as a `u8`.89//!90//! Note the following example encodings use a block size of 4 bytes,91//! as opposed to 32 bytes for brevity:92//!93//! ```text94//! ┌───┬───┬───┬───┬───┬───┐95//! "MEEP" │02 │'M'│'E'│'E'│'P'│04 │96//! └───┴───┴───┴───┴───┴───┘97//!98//! ┌───┐99//! "" │01 |100//! └───┘101//!102//! NULL ┌───┐103//! │00 │104//! └───┘105//!106//! "Defenestration" ┌───┬───┬───┬───┬───┬───┐107//! │02 │'D'│'e'│'f'│'e'│FF │108//! └───┼───┼───┼───┼───┼───┤109//! │'n'│'e'│'s'│'t'│FF │110//! ├───┼───┼───┼───┼───┤111//! │'r'│'a'│'t'│'r'│FF │112//! ├───┼───┼───┼───┼───┤113//! │'a'│'t'│'i'│'o'│FF │114//! ├───┼───┼───┼───┼───┤115//! │'n'│00 │00 │00 │01 │116//! └───┴───┴───┴───┴───┘117//! ```118//!119//! This approach is loosely inspired by [COBS] encoding, and chosen over more traditional120//! [byte stuffing] as it is more amenable to vectorisation, in particular AVX-256.121//!122//! For the unordered row encoding we use a simpler scheme, we prepend the length123//! encoded as 4 bytes followed by the raw data, with nulls being marked with a124//! length of u32::MAX.125//!126//! ## Dictionary Encoding127//!128//! [`RowsEncoded`] needs to support converting dictionary encoded arrays with unsorted, and129//! potentially distinct dictionaries. One simple mechanism to avoid this would be to reverse130//! the dictionary encoding, and encode the array values directly, however, this would lose131//! the benefits of dictionary encoding to reduce memory and CPU consumption.132//!133//! As such the [`RowsEncoded`] creates an order-preserving mapping134//! for each dictionary encoded column, which allows new dictionary135//! values to be added whilst preserving the sort order.136//!137//! A null dictionary value is encoded as `0_u8`.138//!139//! A non-null dictionary value is encoded as `1_u8` followed by a null-terminated byte array140//! key determined by the order-preserving dictionary encoding141//!142//! ```text143//! ┌──────────┐ ┌─────┐144//! │ "Bar" │ ───────────────▶│ 01 │145//! └──────────┘ └─────┘146//! ┌──────────┐ ┌─────┬─────┐147//! │"Fabulous"│ ───────────────▶│ 01 │ 02 │148//! └──────────┘ └─────┴─────┘149//! ┌──────────┐ ┌─────┐150//! │ "Soup" │ ───────────────▶│ 05 │151//! └──────────┘ └─────┘152//! ┌──────────┐ ┌─────┐153//! │ "ZZ" │ ───────────────▶│ 07 │154//! └──────────┘ └─────┘155//!156//! Example Order Preserving Mapping157//! ```158//! Using the map above, the corresponding row format will be159//!160//! ```text161//! ┌─────┬─────┬─────┬─────┐162//! "Fabulous" │ 01 │ 03 │ 05 │ 00 │163//! └─────┴─────┴─────┴─────┘164//!165//! ┌─────┬─────┬─────┐166//! "ZZ" │ 01 │ 07 │ 00 │167//! └─────┴─────┴─────┘168//!169//! ┌─────┐170//! NULL │ 00 │171//! └─────┘172//!173//! Input Row Format174//! ```175//!176//! ## Struct Encoding177//!178//! A null is encoded as a `0_u8`.179//!180//! A valid value is encoded as `1_u8` followed by the row encoding of each child.181//!182//! This encoding effectively flattens the schema in a depth-first fashion.183//!184//! For example185//!186//! ```text187//! ┌───────┬────────────────────────┬───────┐188//! │ Int32 │ Struct[Int32, Float32] │ Int32 │189//! └───────┴────────────────────────┴───────┘190//! ```191//!192//! Is encoded as193//!194//! ```text195//! ┌───────┬───────────────┬───────┬─────────┬───────┐196//! │ Int32 │ Null Sentinel │ Int32 │ Float32 │ Int32 │197//! └───────┴───────────────┴───────┴─────────┴───────┘198//! ```199//!200//! ## List Encoding201//!202//! Lists are encoded by first encoding all child elements to the row format.203//!204//! A "canonical byte array" is then constructed by concatenating the row205//! encodings of all their elements into a single binary array, followed206//! by the lengths of each encoded row, and the number of elements, encoded207//! as big endian `u32`.208//!209//! This canonical byte array is then encoded using the variable length byte210//! encoding described above.211//!212//! _The lengths are not strictly necessary but greatly simplify decode, they213//! may be removed in a future iteration_.214//!215//! For example given:216//!217//! ```text218//! [1_u8, 2_u8, 3_u8]219//! [1_u8, null]220//! []221//! null222//! ```223//!224//! The elements would be converted to:225//!226//! ```text227//! ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐228//! 1 │01│01│ 2 │01│02│ 3 │01│03│ 1 │01│01│ null │00│00│229//! └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘230//!```231//!232//! Which would be grouped into the following canonical byte arrays:233//!234//! ```text235//! ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐236//! [1_u8, 2_u8, 3_u8] │01│01│01│02│01│03│00│00│00│02│00│00│00│02│00│00│00│02│00│00│00│03│237//! └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘238//! └──── rows ────┘ └───────── row lengths ─────────┘ └─ count ─┘239//!240//! ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐241//! [1_u8, null] │01│01│00│00│00│00│00│02│00│00│00│02│00│00│00│02│242//! └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘243//!```244//!245//! With `[]` represented by an empty byte array, and `null` a null byte array.246//!247//! These byte arrays will then be encoded using the variable length byte encoding248//! described above.249//!250//! # Ordering251//!252//! ## Float Ordering253//!254//! Floats are totally ordered just like in the rest of Polars,255//! -inf < neg < -0.0 = 0.0 < pos < inf < nan, with all nans being equal.256//!257//! ## Null Ordering258//!259//! The encoding described above will order nulls first, this can be inverted by representing260//! nulls as `0xFF_u8` instead of `0_u8`261//!262//! ## Reverse Column Ordering263//!264//! The order of a given column can be reversed by negating the encoded bytes of non-null values265//!266//! [COBS]: https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing267//! [byte stuffing]: https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing268269extern crate core;270271pub mod decode;272pub mod encode;273pub(crate) mod fixed;274mod row;275mod utils;276pub(crate) mod variable;277mod widths;278279use arrow::array::*;280pub type ArrayRef = Box<dyn Array>;281282pub use encode::{283convert_columns, convert_columns_amortized, convert_columns_amortized_no_order,284convert_columns_no_order,285};286pub use row::{RowEncodingCategoricalContext, RowEncodingContext, RowEncodingOptions, RowsEncoded};287288289