// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-core/src/frame/row/transpose.rs
1
use std::borrow::Cow;
2
3
use either::Either;
4
5
use super::*;
6
7
impl DataFrame {
8
pub(crate) fn transpose_from_dtype(
9
&self,
10
dtype: &DataType,
11
keep_names_as: Option<PlSmallStr>,
12
names_out: &[PlSmallStr],
13
) -> PolarsResult<DataFrame> {
14
let new_width = self.height();
15
let new_height = self.width();
16
// Allocate space for the transposed columns, putting the "row names" first if needed
17
let mut cols_t = match keep_names_as {
18
None => Vec::<Column>::with_capacity(new_width),
19
Some(name) => {
20
let mut tmp = Vec::<Column>::with_capacity(new_width + 1);
21
tmp.push(
22
StringChunked::from_iter_values(
23
name,
24
self.get_column_names_owned().into_iter(),
25
)
26
.into_column(),
27
);
28
tmp
29
},
30
};
31
32
let cols = self.columns();
33
match dtype {
34
#[cfg(feature = "dtype-i8")]
35
DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
36
#[cfg(feature = "dtype-i16")]
37
DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
38
DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
39
DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
40
#[cfg(feature = "dtype-u8")]
41
DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
42
#[cfg(feature = "dtype-u16")]
43
DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
44
DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
45
DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
46
DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
47
DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
48
#[cfg(feature = "object")]
49
DataType::Object(_) => {
50
// this requires to support `Object` in Series::iter which we don't yet
51
polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'")
52
},
53
_ => {
54
let phys_dtype = dtype.to_physical();
55
let mut buffers = (0..new_width)
56
.map(|_| {
57
let buf: AnyValueBufferTrusted = (&phys_dtype, new_height).into();
58
buf
59
})
60
.collect::<Vec<_>>();
61
62
let columns = self
63
.materialized_column_iter()
64
// first cast to supertype before casting to physical to ensure units are correct
65
.map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap())
66
.collect::<Vec<_>>();
67
68
// this is very expensive. A lot of cache misses here.
69
// This is the part that is performance critical.
70
for series in &columns {
71
polars_ensure!(
72
series.dtype() == &phys_dtype,
73
ComputeError: "cannot transpose with supertype: {}", dtype
74
);
75
for (av, buf) in series.iter().zip(buffers.iter_mut()) {
76
// SAFETY: we checked the type and we borrow
77
unsafe {
78
buf.add_unchecked_borrowed_physical(&av);
79
}
80
}
81
}
82
cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
83
// SAFETY: we are casting back to the supertype
84
let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() };
85
s.rename(name.clone());
86
s.into()
87
}));
88
},
89
};
90
91
DataFrame::new(new_height, cols_t)
92
}
93
94
pub fn transpose(
95
&mut self,
96
keep_names_as: Option<&str>,
97
new_col_names: Option<Either<String, Vec<String>>>,
98
) -> PolarsResult<DataFrame> {
99
let new_col_names = match new_col_names {
100
None => None,
101
Some(Either::Left(v)) => Some(Either::Left(v.into())),
102
Some(Either::Right(v)) => Some(Either::Right(
103
v.into_iter().map(Into::into).collect::<Vec<_>>(),
104
)),
105
};
106
107
self.transpose_impl(keep_names_as, new_col_names)
108
}
109
/// Transpose a DataFrame. This is a very expensive operation.
110
pub fn transpose_impl(
111
&mut self,
112
keep_names_as: Option<&str>,
113
new_col_names: Option<Either<PlSmallStr, Vec<PlSmallStr>>>,
114
) -> PolarsResult<DataFrame> {
115
// We must iterate columns as [`AnyValue`], so we must be contiguous.
116
self.rechunk_mut_par();
117
118
let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
119
let names_out = match new_col_names {
120
None => (0..self.height())
121
.map(|i| format_pl_smallstr!("column_{i}"))
122
.collect(),
123
Some(cn) => match cn {
124
Either::Left(name) => {
125
let new_names = self.column(name.as_str()).and_then(|x| x.str())?;
126
polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values");
127
df = Cow::Owned(self.drop(name.as_str())?);
128
new_names
129
.into_no_null_iter()
130
.map(PlSmallStr::from_str)
131
.collect()
132
},
133
Either::Right(names) => {
134
polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
135
names
136
},
137
},
138
};
139
if let Some(cn) = keep_names_as {
140
// Check that the column name we're using for the original column names is unique before
141
// wasting time transposing
142
polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
143
}
144
polars_ensure!(
145
df.height() != 0 && df.width() != 0,
146
NoData: "unable to transpose an empty DataFrame"
147
);
148
let dtype = df.get_supertype().unwrap()?;
149
df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out)
150
}
151
}
152
153
#[inline]
154
unsafe fn add_value<T: NumericNative>(
155
values_buf_ptr: usize,
156
col_idx: usize,
157
row_idx: usize,
158
value: T,
159
) {
160
let vec_ref: &mut Vec<Vec<T>> = &mut *(values_buf_ptr as *mut Vec<Vec<T>>);
161
let column = vec_ref.get_unchecked_mut(col_idx);
162
let el_ptr = column.as_mut_ptr();
163
*el_ptr.add(row_idx) = value;
164
}
165
166
// This just fills a pre-allocated mutable series vector, which may have a name column.
167
// Nothing is returned and the actual DataFrame is constructed above.
168
pub(super) fn numeric_transpose<T: PolarsNumericType>(
169
cols: &[Column],
170
names_out: &[PlSmallStr],
171
cols_t: &mut Vec<Column>,
172
) {
173
let new_width = cols[0].len();
174
let new_height = cols.len();
175
176
let has_nulls = cols.iter().any(|s| s.null_count() > 0);
177
178
let mut values_buf: Vec<Vec<T::Native>> = (0..new_width)
179
.map(|_| Vec::with_capacity(new_height))
180
.collect();
181
let mut validity_buf: Vec<_> = if has_nulls {
182
// we first use bools instead of bits, because we can access these in parallel without aliasing
183
(0..new_width).map(|_| vec![true; new_height]).collect()
184
} else {
185
(0..new_width).map(|_| vec![]).collect()
186
};
187
188
// work with *mut pointers because we it is UB write to &refs.
189
let values_buf_ptr = &mut values_buf as *mut Vec<Vec<T::Native>> as usize;
190
let validity_buf_ptr = &mut validity_buf as *mut Vec<Vec<bool>> as usize;
191
192
POOL.install(|| {
193
cols.iter()
194
.map(Column::as_materialized_series)
195
.enumerate()
196
.for_each(|(row_idx, s)| {
197
let s = s.cast(&T::get_static_dtype()).unwrap();
198
let ca = s.unpack::<T>().unwrap();
199
200
// SAFETY:
201
// we access in parallel, but every access is unique, so we don't break aliasing rules
202
// we also ensured we allocated enough memory, so we never reallocate and thus
203
// the pointers remain valid.
204
if has_nulls {
205
for (col_idx, opt_v) in ca.iter().enumerate() {
206
match opt_v {
207
None => unsafe {
208
let validity_vec: &mut Vec<Vec<bool>> =
209
&mut *(validity_buf_ptr as *mut Vec<Vec<bool>>);
210
let column = validity_vec.get_unchecked_mut(col_idx);
211
let el_ptr = column.as_mut_ptr();
212
*el_ptr.add(row_idx) = false;
213
// we must initialize this memory otherwise downstream code
214
// might access uninitialized memory when the masked out values
215
// are changed.
216
add_value(values_buf_ptr, col_idx, row_idx, T::Native::default());
217
},
218
Some(v) => unsafe {
219
add_value(values_buf_ptr, col_idx, row_idx, v);
220
},
221
}
222
}
223
} else {
224
for (col_idx, v) in ca.into_no_null_iter().enumerate() {
225
unsafe {
226
let column: &mut Vec<Vec<T::Native>> =
227
&mut *(values_buf_ptr as *mut Vec<Vec<T::Native>>);
228
let el_ptr = column.get_unchecked_mut(col_idx).as_mut_ptr();
229
*el_ptr.add(row_idx) = v;
230
}
231
}
232
}
233
})
234
});
235
236
let par_iter = values_buf
237
.into_par_iter()
238
.zip(validity_buf)
239
.zip(names_out)
240
.map(|((mut values, validity), name)| {
241
// SAFETY:
242
// all values are written we can now set len
243
unsafe {
244
values.set_len(new_height);
245
}
246
247
let validity = if has_nulls {
248
let validity = Bitmap::from_trusted_len_iter(validity.iter().copied());
249
if validity.unset_bits() > 0 {
250
Some(validity)
251
} else {
252
None
253
}
254
} else {
255
None
256
};
257
258
let arr = PrimitiveArray::<T::Native>::new(
259
T::get_static_dtype().to_arrow(CompatLevel::newest()),
260
values.into(),
261
validity,
262
);
263
ChunkedArray::<T>::with_chunk(name.clone(), arr).into_column()
264
});
265
POOL.install(|| cols_t.par_extend(par_iter));
266
}
267
268
#[cfg(test)]
mod test {
    use super::*;

    /// Checks `transpose` with generated output names on three frames: integers without nulls,
    /// integers with nulls, and a string column mixed with an integer column.
    #[test]
    fn test_transpose() -> PolarsResult<()> {
        // Integers without nulls: rows and columns simply swap.
        let mut ints = df![
            "a" => [1, 2, 3],
            "b" => [10, 20, 30],
        ]?;
        let transposed = ints.transpose(None, None)?;
        let want = df![
            "column_0" => [1, 10],
            "column_1" => [2, 20],
            "column_2" => [3, 30],
        ]?;
        assert!(transposed.equals_missing(&want));

        // Integers with nulls: each null must land in the mirrored position.
        let mut ints_with_nulls = df![
            "a" => [Some(1), None, Some(3)],
            "b" => [Some(10), Some(20), None],
        ]?;
        let transposed = ints_with_nulls.transpose(None, None)?;
        let want = df![
            "column_0" => [1, 10],
            "column_1" => [None, Some(20)],
            "column_2" => [Some(3), None],
        ]?;
        assert!(transposed.equals_missing(&want));

        // Strings mixed with integers: the integers are expected back as strings.
        let mut mixed = df![
            "a" => ["a", "b", "c"],
            "b" => [Some(10), Some(20), None],
        ]?;
        let transposed = mixed.transpose(None, None)?;
        let want = df![
            "column_0" => ["a", "10"],
            "column_1" => ["b", "20"],
            "column_2" => [Some("c"), None],
        ]?;
        assert!(transposed.equals_missing(&want));

        Ok(())
    }
}
316
317