Path: blob/main/crates/polars-core/src/frame/row/transpose.rs
6940 views
use std::borrow::Cow;12use either::Either;34use super::*;56impl DataFrame {7pub(crate) fn transpose_from_dtype(8&self,9dtype: &DataType,10keep_names_as: Option<PlSmallStr>,11names_out: &[PlSmallStr],12) -> PolarsResult<DataFrame> {13let new_width = self.height();14let new_height = self.width();15// Allocate space for the transposed columns, putting the "row names" first if needed16let mut cols_t = match keep_names_as {17None => Vec::<Column>::with_capacity(new_width),18Some(name) => {19let mut tmp = Vec::<Column>::with_capacity(new_width + 1);20tmp.push(21StringChunked::from_iter_values(22name,23self.get_column_names_owned().into_iter(),24)25.into_column(),26);27tmp28},29};3031let cols = &self.columns;32match dtype {33#[cfg(feature = "dtype-i8")]34DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),35#[cfg(feature = "dtype-i16")]36DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),37DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),38DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),39#[cfg(feature = "dtype-u8")]40DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),41#[cfg(feature = "dtype-u16")]42DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),43DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),44DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),45DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),46DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),47#[cfg(feature = "object")]48DataType::Object(_) => {49// this requires to support `Object` in Series::iter which we don't yet50polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'")51},52_ => {53let phys_dtype = dtype.to_physical();54let mut buffers = (0..new_width)55.map(|_| {56let buf: AnyValueBufferTrusted = (&phys_dtype, new_height).into();57buf58})59.collect::<Vec<_>>();6061let columns = self62.materialized_column_iter()63// first cast to supertype before casting to physical to ensure units are correct64.map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap())65.collect::<Vec<_>>();6667// this is very expensive. A lot of cache misses here.68// This is the part that is performance critical.69for s in columns {70polars_ensure!(s.dtype() == &phys_dtype, ComputeError: "cannot transpose with supertype: {}", dtype);71s.iter().zip(buffers.iter_mut()).for_each(|(av, buf)| {72// SAFETY: we checked the type and we borrow73unsafe {74buf.add_unchecked_borrowed_physical(&av);75}76});77}78cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {79// SAFETY: we are casting back to the supertype80let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() };81s.rename(name.clone());82s.into()83}));84},85};86Ok(unsafe { DataFrame::new_no_checks(new_height, cols_t) })87}8889pub fn transpose(90&mut self,91keep_names_as: Option<&str>,92new_col_names: Option<Either<String, Vec<String>>>,93) -> PolarsResult<DataFrame> {94let new_col_names = match new_col_names {95None => None,96Some(Either::Left(v)) => Some(Either::Left(v.into())),97Some(Either::Right(v)) => Some(Either::Right(98v.into_iter().map(Into::into).collect::<Vec<_>>(),99)),100};101102self.transpose_impl(keep_names_as, new_col_names)103}104/// Transpose a DataFrame. This is a very expensive operation.105pub fn transpose_impl(106&mut self,107keep_names_as: Option<&str>,108new_col_names: Option<Either<PlSmallStr, Vec<PlSmallStr>>>,109) -> PolarsResult<DataFrame> {110// We must iterate columns as [`AnyValue`], so we must be contiguous.111self.as_single_chunk_par();112113let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column114let names_out = match new_col_names {115None => (0..self.height())116.map(|i| format_pl_smallstr!("column_{i}"))117.collect(),118Some(cn) => match cn {119Either::Left(name) => {120let new_names = self.column(name.as_str()).and_then(|x| x.str())?;121polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values");122df = Cow::Owned(self.drop(name.as_str())?);123new_names124.into_no_null_iter()125.map(PlSmallStr::from_str)126.collect()127},128Either::Right(names) => {129polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");130names131},132},133};134if let Some(cn) = keep_names_as {135// Check that the column name we're using for the original column names is unique before136// wasting time transposing137polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)138}139polars_ensure!(140df.height() != 0 && df.width() != 0,141NoData: "unable to transpose an empty DataFrame"142);143let dtype = df.get_supertype().unwrap()?;144df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out)145}146}147148#[inline]149unsafe fn add_value<T: NumericNative>(150values_buf_ptr: usize,151col_idx: usize,152row_idx: usize,153value: T,154) {155let vec_ref: &mut Vec<Vec<T>> = &mut *(values_buf_ptr as *mut Vec<Vec<T>>);156let column = vec_ref.get_unchecked_mut(col_idx);157let el_ptr = column.as_mut_ptr();158*el_ptr.add(row_idx) = value;159}160161// This just fills a pre-allocated mutable series vector, which may have a name column.162// Nothing is returned and the actual DataFrame is constructed above.163pub(super) fn numeric_transpose<T: PolarsNumericType>(164cols: &[Column],165names_out: &[PlSmallStr],166cols_t: &mut Vec<Column>,167) {168let new_width = cols[0].len();169let new_height = cols.len();170171let has_nulls = cols.iter().any(|s| s.null_count() > 0);172173let mut values_buf: Vec<Vec<T::Native>> = (0..new_width)174.map(|_| Vec::with_capacity(new_height))175.collect();176let mut validity_buf: Vec<_> = if has_nulls {177// we first use bools instead of bits, because we can access these in parallel without aliasing178(0..new_width).map(|_| vec![true; new_height]).collect()179} else {180(0..new_width).map(|_| vec![]).collect()181};182183// work with *mut pointers because we it is UB write to &refs.184let values_buf_ptr = &mut values_buf as *mut Vec<Vec<T::Native>> as usize;185let validity_buf_ptr = &mut validity_buf as *mut Vec<Vec<bool>> as usize;186187POOL.install(|| {188cols.iter()189.map(Column::as_materialized_series)190.enumerate()191.for_each(|(row_idx, s)| {192let s = s.cast(&T::get_static_dtype()).unwrap();193let ca = s.unpack::<T>().unwrap();194195// SAFETY:196// we access in parallel, but every access is unique, so we don't break aliasing rules197// we also ensured we allocated enough memory, so we never reallocate and thus198// the pointers remain valid.199if has_nulls {200for (col_idx, opt_v) in ca.iter().enumerate() {201match opt_v {202None => unsafe {203let validity_vec: &mut Vec<Vec<bool>> =204&mut *(validity_buf_ptr as *mut Vec<Vec<bool>>);205let column = validity_vec.get_unchecked_mut(col_idx);206let el_ptr = column.as_mut_ptr();207*el_ptr.add(row_idx) = false;208// we must initialize this memory otherwise downstream code209// might access uninitialized memory when the masked out values210// are changed.211add_value(values_buf_ptr, col_idx, row_idx, T::Native::default());212},213Some(v) => unsafe {214add_value(values_buf_ptr, col_idx, row_idx, v);215},216}217}218} else {219for (col_idx, v) in ca.into_no_null_iter().enumerate() {220unsafe {221let column: &mut Vec<Vec<T::Native>> =222&mut *(values_buf_ptr as *mut Vec<Vec<T::Native>>);223let el_ptr = column.get_unchecked_mut(col_idx).as_mut_ptr();224*el_ptr.add(row_idx) = v;225}226}227}228})229});230231let par_iter = values_buf232.into_par_iter()233.zip(validity_buf)234.zip(names_out)235.map(|((mut values, validity), name)| {236// SAFETY:237// all values are written we can now set len238unsafe {239values.set_len(new_height);240}241242let validity = if has_nulls {243let validity = Bitmap::from_trusted_len_iter(validity.iter().copied());244if validity.unset_bits() > 0 {245Some(validity)246} else {247None248}249} else {250None251};252253let arr = PrimitiveArray::<T::Native>::new(254T::get_static_dtype().to_arrow(CompatLevel::newest()),255values.into(),256validity,257);258ChunkedArray::<T>::with_chunk(name.clone(), arr).into_column()259});260POOL.install(|| cols_t.par_extend(par_iter));261}262263#[cfg(test)]264mod test {265use super::*;266267#[test]268fn test_transpose() -> PolarsResult<()> {269let mut df = df![270"a" => [1, 2, 3],271"b" => [10, 20, 30],272]?;273274let out = df.transpose(None, None)?;275let expected = df![276"column_0" => [1, 10],277"column_1" => [2, 20],278"column_2" => [3, 30],279280]?;281assert!(out.equals_missing(&expected));282283let mut df = df![284"a" => [Some(1), None, Some(3)],285"b" => [Some(10), Some(20), None],286]?;287let out = df.transpose(None, None)?;288let expected = df![289"column_0" => [1, 10],290"column_1" => [None, Some(20)],291"column_2" => [Some(3), None],292293]?;294assert!(out.equals_missing(&expected));295296let mut df = df![297"a" => ["a", "b", "c"],298"b" => [Some(10), Some(20), None],299]?;300let out = df.transpose(None, None)?;301let expected = df![302"column_0" => ["a", "10"],303"column_1" => ["b", "20"],304"column_2" => [Some("c"), None],305306]?;307assert!(out.equals_missing(&expected));308Ok(())309}310}311312313