CoCalc -- transpose.rs

GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/frame/row/transpose.rs
⁶⁹⁴⁰ views
1
use std::borrow::Cow;
2

3
use either::Either;
4

5
use super::*;
6

7
impl DataFrame {
8
    pub(crate) fn transpose_from_dtype(
9
        &self,
10
        dtype: &DataType,
11
        keep_names_as: Option<PlSmallStr>,
12
        names_out: &[PlSmallStr],
13
    ) -> PolarsResult<DataFrame> {
14
        let new_width = self.height();
15
        let new_height = self.width();
16
        // Allocate space for the transposed columns, putting the "row names" first if needed
17
        let mut cols_t = match keep_names_as {
18
            None => Vec::<Column>::with_capacity(new_width),
19
            Some(name) => {
20
                let mut tmp = Vec::<Column>::with_capacity(new_width + 1);
21
                tmp.push(
22
                    StringChunked::from_iter_values(
23
                        name,
24
                        self.get_column_names_owned().into_iter(),
25
                    )
26
                    .into_column(),
27
                );
28
                tmp
29
            },
30
        };
31

32
        let cols = &self.columns;
33
        match dtype {
34
            #[cfg(feature = "dtype-i8")]
35
            DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
36
            #[cfg(feature = "dtype-i16")]
37
            DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
38
            DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
39
            DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
40
            #[cfg(feature = "dtype-u8")]
41
            DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
42
            #[cfg(feature = "dtype-u16")]
43
            DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
44
            DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
45
            DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
46
            DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
47
            DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
48
            #[cfg(feature = "object")]
49
            DataType::Object(_) => {
50
                // this requires to support `Object` in Series::iter which we don't yet
51
                polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'")
52
            },
53
            _ => {
54
                let phys_dtype = dtype.to_physical();
55
                let mut buffers = (0..new_width)
56
                    .map(|_| {
57
                        let buf: AnyValueBufferTrusted = (&phys_dtype, new_height).into();
58
                        buf
59
                    })
60
                    .collect::<Vec<_>>();
61

62
                let columns = self
63
                    .materialized_column_iter()
64
                    // first cast to supertype before casting to physical to ensure units are correct
65
                    .map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap())
66
                    .collect::<Vec<_>>();
67

68
                // this is very expensive. A lot of cache misses here.
69
                // This is the part that is performance critical.
70
                for s in columns {
71
                    polars_ensure!(s.dtype() == &phys_dtype, ComputeError: "cannot transpose with supertype: {}", dtype);
72
                    s.iter().zip(buffers.iter_mut()).for_each(|(av, buf)| {
73
                        // SAFETY: we checked the type and we borrow
74
                        unsafe {
75
                            buf.add_unchecked_borrowed_physical(&av);
76
                        }
77
                    });
78
                }
79
                cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
80
                    // SAFETY: we are casting back to the supertype
81
                    let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() };
82
                    s.rename(name.clone());
83
                    s.into()
84
                }));
85
            },
86
        };
87
        Ok(unsafe { DataFrame::new_no_checks(new_height, cols_t) })
88
    }
89

90
    pub fn transpose(
91
        &mut self,
92
        keep_names_as: Option<&str>,
93
        new_col_names: Option<Either<String, Vec<String>>>,
94
    ) -> PolarsResult<DataFrame> {
95
        let new_col_names = match new_col_names {
96
            None => None,
97
            Some(Either::Left(v)) => Some(Either::Left(v.into())),
98
            Some(Either::Right(v)) => Some(Either::Right(
99
                v.into_iter().map(Into::into).collect::<Vec<_>>(),
100
            )),
101
        };
102

103
        self.transpose_impl(keep_names_as, new_col_names)
104
    }
105
    /// Transpose a DataFrame. This is a very expensive operation.
106
    pub fn transpose_impl(
107
        &mut self,
108
        keep_names_as: Option<&str>,
109
        new_col_names: Option<Either<PlSmallStr, Vec<PlSmallStr>>>,
110
    ) -> PolarsResult<DataFrame> {
111
        // We must iterate columns as [`AnyValue`], so we must be contiguous.
112
        self.as_single_chunk_par();
113

114
        let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
115
        let names_out = match new_col_names {
116
            None => (0..self.height())
117
                .map(|i| format_pl_smallstr!("column_{i}"))
118
                .collect(),
119
            Some(cn) => match cn {
120
                Either::Left(name) => {
121
                    let new_names = self.column(name.as_str()).and_then(|x| x.str())?;
122
                    polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values");
123
                    df = Cow::Owned(self.drop(name.as_str())?);
124
                    new_names
125
                        .into_no_null_iter()
126
                        .map(PlSmallStr::from_str)
127
                        .collect()
128
                },
129
                Either::Right(names) => {
130
                    polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
131
                    names
132
                },
133
            },
134
        };
135
        if let Some(cn) = keep_names_as {
136
            // Check that the column name we're using for the original column names is unique before
137
            // wasting time transposing
138
            polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
139
        }
140
        polars_ensure!(
141
            df.height() != 0 && df.width() != 0,
142
            NoData: "unable to transpose an empty DataFrame"
143
        );
144
        let dtype = df.get_supertype().unwrap()?;
145
        df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out)
146
    }
147
}
148

149
#[inline]
150
unsafe fn add_value<T: NumericNative>(
151
    values_buf_ptr: usize,
152
    col_idx: usize,
153
    row_idx: usize,
154
    value: T,
155
) {
156
    let vec_ref: &mut Vec<Vec<T>> = &mut *(values_buf_ptr as *mut Vec<Vec<T>>);
157
    let column = vec_ref.get_unchecked_mut(col_idx);
158
    let el_ptr = column.as_mut_ptr();
159
    *el_ptr.add(row_idx) = value;
160
}
161

162
// This just fills a pre-allocated mutable series vector, which may have a name column.
163
// Nothing is returned and the actual DataFrame is constructed above.
164
pub(super) fn numeric_transpose<T: PolarsNumericType>(
165
    cols: &[Column],
166
    names_out: &[PlSmallStr],
167
    cols_t: &mut Vec<Column>,
168
) {
169
    let new_width = cols[0].len();
170
    let new_height = cols.len();
171

172
    let has_nulls = cols.iter().any(|s| s.null_count() > 0);
173

174
    let mut values_buf: Vec<Vec<T::Native>> = (0..new_width)
175
        .map(|_| Vec::with_capacity(new_height))
176
        .collect();
177
    let mut validity_buf: Vec<_> = if has_nulls {
178
        // we first use bools instead of bits, because we can access these in parallel without aliasing
179
        (0..new_width).map(|_| vec![true; new_height]).collect()
180
    } else {
181
        (0..new_width).map(|_| vec![]).collect()
182
    };
183

184
    // work with *mut pointers because we it is UB write to &refs.
185
    let values_buf_ptr = &mut values_buf as *mut Vec<Vec<T::Native>> as usize;
186
    let validity_buf_ptr = &mut validity_buf as *mut Vec<Vec<bool>> as usize;
187

188
    POOL.install(|| {
189
        cols.iter()
190
            .map(Column::as_materialized_series)
191
            .enumerate()
192
            .for_each(|(row_idx, s)| {
193
                let s = s.cast(&T::get_static_dtype()).unwrap();
194
                let ca = s.unpack::<T>().unwrap();
195

196
                // SAFETY:
197
                // we access in parallel, but every access is unique, so we don't break aliasing rules
198
                // we also ensured we allocated enough memory, so we never reallocate and thus
199
                // the pointers remain valid.
200
                if has_nulls {
201
                    for (col_idx, opt_v) in ca.iter().enumerate() {
202
                        match opt_v {
203
                            None => unsafe {
204
                                let validity_vec: &mut Vec<Vec<bool>> =
205
                                    &mut *(validity_buf_ptr as *mut Vec<Vec<bool>>);
206
                                let column = validity_vec.get_unchecked_mut(col_idx);
207
                                let el_ptr = column.as_mut_ptr();
208
                                *el_ptr.add(row_idx) = false;
209
                                // we must initialize this memory otherwise downstream code
210
                                // might access uninitialized memory when the masked out values
211
                                // are changed.
212
                                add_value(values_buf_ptr, col_idx, row_idx, T::Native::default());
213
                            },
214
                            Some(v) => unsafe {
215
                                add_value(values_buf_ptr, col_idx, row_idx, v);
216
                            },
217
                        }
218
                    }
219
                } else {
220
                    for (col_idx, v) in ca.into_no_null_iter().enumerate() {
221
                        unsafe {
222
                            let column: &mut Vec<Vec<T::Native>> =
223
                                &mut *(values_buf_ptr as *mut Vec<Vec<T::Native>>);
224
                            let el_ptr = column.get_unchecked_mut(col_idx).as_mut_ptr();
225
                            *el_ptr.add(row_idx) = v;
226
                        }
227
                    }
228
                }
229
            })
230
    });
231

232
    let par_iter = values_buf
233
        .into_par_iter()
234
        .zip(validity_buf)
235
        .zip(names_out)
236
        .map(|((mut values, validity), name)| {
237
            // SAFETY:
238
            // all values are written we can now set len
239
            unsafe {
240
                values.set_len(new_height);
241
            }
242

243
            let validity = if has_nulls {
244
                let validity = Bitmap::from_trusted_len_iter(validity.iter().copied());
245
                if validity.unset_bits() > 0 {
246
                    Some(validity)
247
                } else {
248
                    None
249
                }
250
            } else {
251
                None
252
            };
253

254
            let arr = PrimitiveArray::<T::Native>::new(
255
                T::get_static_dtype().to_arrow(CompatLevel::newest()),
256
                values.into(),
257
                validity,
258
            );
259
            ChunkedArray::<T>::with_chunk(name.clone(), arr).into_column()
260
        });
261
    POOL.install(|| cols_t.par_extend(par_iter));
262
}
263

264
#[cfg(test)]
mod test {
    use super::*;

    /// Transposes three small frames with default output names and compares each
    /// result against a hand-written expected frame.
    #[test]
    fn test_transpose() -> PolarsResult<()> {
        // Case 1: two integer columns without nulls.
        let mut ints = df![
            "a" => [1, 2, 3],
            "b" => [10, 20, 30],
        ]?;
        let got = ints.transpose(None, None)?;
        let want = df![
            "column_0" => [1, 10],
            "column_1" => [2, 20],
            "column_2" => [3, 30],
        ]?;
        assert!(got.equals_missing(&want));

        // Case 2: integer columns with nulls; nulls must land in the mirrored position.
        let mut ints_with_nulls = df![
            "a" => [Some(1), None, Some(3)],
            "b" => [Some(10), Some(20), None],
        ]?;
        let got = ints_with_nulls.transpose(None, None)?;
        let want = df![
            "column_0" => [1, 10],
            "column_1" => [None, Some(20)],
            "column_2" => [Some(3), None],
        ]?;
        assert!(got.equals_missing(&want));

        // Case 3: a string column next to an integer column; the output is all strings.
        let mut mixed = df![
            "a" => ["a", "b", "c"],
            "b" => [Some(10), Some(20), None],
        ]?;
        let got = mixed.transpose(None, None)?;
        let want = df![
            "column_0" => ["a", "10"],
            "column_1" => ["b", "20"],
            "column_2" => [Some("c"), None],
        ]?;
        assert!(got.equals_missing(&want));

        Ok(())
    }
}
312

313
Product

Resources

Company