// Path: blob/main/crates/polars-core/src/frame/row/transpose.rs
// 8458 views
use std::borrow::Cow;12use either::Either;34use super::*;56impl DataFrame {7pub(crate) fn transpose_from_dtype(8&self,9dtype: &DataType,10keep_names_as: Option<PlSmallStr>,11names_out: &[PlSmallStr],12) -> PolarsResult<DataFrame> {13let new_width = self.height();14let new_height = self.width();15// Allocate space for the transposed columns, putting the "row names" first if needed16let mut cols_t = match keep_names_as {17None => Vec::<Column>::with_capacity(new_width),18Some(name) => {19let mut tmp = Vec::<Column>::with_capacity(new_width + 1);20tmp.push(21StringChunked::from_iter_values(22name,23self.get_column_names_owned().into_iter(),24)25.into_column(),26);27tmp28},29};3031let cols = self.columns();32match dtype {33#[cfg(feature = "dtype-i8")]34DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),35#[cfg(feature = "dtype-i16")]36DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),37DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),38DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),39#[cfg(feature = "dtype-u8")]40DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),41#[cfg(feature = "dtype-u16")]42DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),43DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),44DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),45DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),46DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),47#[cfg(feature = "object")]48DataType::Object(_) => {49// this requires to support `Object` in Series::iter which we don't yet50polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'")51},52_ => {53let phys_dtype = dtype.to_physical();54let mut buffers = (0..new_width)55.map(|_| {56let buf: 
AnyValueBufferTrusted = (&phys_dtype, new_height).into();57buf58})59.collect::<Vec<_>>();6061let columns = self62.materialized_column_iter()63// first cast to supertype before casting to physical to ensure units are correct64.map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap())65.collect::<Vec<_>>();6667// this is very expensive. A lot of cache misses here.68// This is the part that is performance critical.69for series in &columns {70polars_ensure!(71series.dtype() == &phys_dtype,72ComputeError: "cannot transpose with supertype: {}", dtype73);74for (av, buf) in series.iter().zip(buffers.iter_mut()) {75// SAFETY: we checked the type and we borrow76unsafe {77buf.add_unchecked_borrowed_physical(&av);78}79}80}81cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {82// SAFETY: we are casting back to the supertype83let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() };84s.rename(name.clone());85s.into()86}));87},88};8990DataFrame::new(new_height, cols_t)91}9293pub fn transpose(94&mut self,95keep_names_as: Option<&str>,96new_col_names: Option<Either<String, Vec<String>>>,97) -> PolarsResult<DataFrame> {98let new_col_names = match new_col_names {99None => None,100Some(Either::Left(v)) => Some(Either::Left(v.into())),101Some(Either::Right(v)) => Some(Either::Right(102v.into_iter().map(Into::into).collect::<Vec<_>>(),103)),104};105106self.transpose_impl(keep_names_as, new_col_names)107}108/// Transpose a DataFrame. 
This is a very expensive operation.109pub fn transpose_impl(110&mut self,111keep_names_as: Option<&str>,112new_col_names: Option<Either<PlSmallStr, Vec<PlSmallStr>>>,113) -> PolarsResult<DataFrame> {114// We must iterate columns as [`AnyValue`], so we must be contiguous.115self.rechunk_mut_par();116117let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column118let names_out = match new_col_names {119None => (0..self.height())120.map(|i| format_pl_smallstr!("column_{i}"))121.collect(),122Some(cn) => match cn {123Either::Left(name) => {124let new_names = self.column(name.as_str()).and_then(|x| x.str())?;125polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values");126df = Cow::Owned(self.drop(name.as_str())?);127new_names128.into_no_null_iter()129.map(PlSmallStr::from_str)130.collect()131},132Either::Right(names) => {133polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");134names135},136},137};138if let Some(cn) = keep_names_as {139// Check that the column name we're using for the original column names is unique before140// wasting time transposing141polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)142}143polars_ensure!(144df.height() != 0 && df.width() != 0,145NoData: "unable to transpose an empty DataFrame"146);147let dtype = df.get_supertype().unwrap()?;148df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out)149}150}151152#[inline]153unsafe fn add_value<T: NumericNative>(154values_buf_ptr: usize,155col_idx: usize,156row_idx: usize,157value: T,158) {159let vec_ref: &mut Vec<Vec<T>> = &mut *(values_buf_ptr as *mut Vec<Vec<T>>);160let column = vec_ref.get_unchecked_mut(col_idx);161let el_ptr = column.as_mut_ptr();162*el_ptr.add(row_idx) = value;163}164165// This just fills a pre-allocated mutable series vector, which may have a 
name column.166// Nothing is returned and the actual DataFrame is constructed above.167pub(super) fn numeric_transpose<T: PolarsNumericType>(168cols: &[Column],169names_out: &[PlSmallStr],170cols_t: &mut Vec<Column>,171) {172let new_width = cols[0].len();173let new_height = cols.len();174175let has_nulls = cols.iter().any(|s| s.null_count() > 0);176177let mut values_buf: Vec<Vec<T::Native>> = (0..new_width)178.map(|_| Vec::with_capacity(new_height))179.collect();180let mut validity_buf: Vec<_> = if has_nulls {181// we first use bools instead of bits, because we can access these in parallel without aliasing182(0..new_width).map(|_| vec![true; new_height]).collect()183} else {184(0..new_width).map(|_| vec![]).collect()185};186187// work with *mut pointers because we it is UB write to &refs.188let values_buf_ptr = &mut values_buf as *mut Vec<Vec<T::Native>> as usize;189let validity_buf_ptr = &mut validity_buf as *mut Vec<Vec<bool>> as usize;190191POOL.install(|| {192cols.iter()193.map(Column::as_materialized_series)194.enumerate()195.for_each(|(row_idx, s)| {196let s = s.cast(&T::get_static_dtype()).unwrap();197let ca = s.unpack::<T>().unwrap();198199// SAFETY:200// we access in parallel, but every access is unique, so we don't break aliasing rules201// we also ensured we allocated enough memory, so we never reallocate and thus202// the pointers remain valid.203if has_nulls {204for (col_idx, opt_v) in ca.iter().enumerate() {205match opt_v {206None => unsafe {207let validity_vec: &mut Vec<Vec<bool>> =208&mut *(validity_buf_ptr as *mut Vec<Vec<bool>>);209let column = validity_vec.get_unchecked_mut(col_idx);210let el_ptr = column.as_mut_ptr();211*el_ptr.add(row_idx) = false;212// we must initialize this memory otherwise downstream code213// might access uninitialized memory when the masked out values214// are changed.215add_value(values_buf_ptr, col_idx, row_idx, T::Native::default());216},217Some(v) => unsafe {218add_value(values_buf_ptr, col_idx, row_idx, 
v);219},220}221}222} else {223for (col_idx, v) in ca.into_no_null_iter().enumerate() {224unsafe {225let column: &mut Vec<Vec<T::Native>> =226&mut *(values_buf_ptr as *mut Vec<Vec<T::Native>>);227let el_ptr = column.get_unchecked_mut(col_idx).as_mut_ptr();228*el_ptr.add(row_idx) = v;229}230}231}232})233});234235let par_iter = values_buf236.into_par_iter()237.zip(validity_buf)238.zip(names_out)239.map(|((mut values, validity), name)| {240// SAFETY:241// all values are written we can now set len242unsafe {243values.set_len(new_height);244}245246let validity = if has_nulls {247let validity = Bitmap::from_trusted_len_iter(validity.iter().copied());248if validity.unset_bits() > 0 {249Some(validity)250} else {251None252}253} else {254None255};256257let arr = PrimitiveArray::<T::Native>::new(258T::get_static_dtype().to_arrow(CompatLevel::newest()),259values.into(),260validity,261);262ChunkedArray::<T>::with_chunk(name.clone(), arr).into_column()263});264POOL.install(|| cols_t.par_extend(par_iter));265}266267#[cfg(test)]268mod test {269use super::*;270271#[test]272fn test_transpose() -> PolarsResult<()> {273let mut df = df![274"a" => [1, 2, 3],275"b" => [10, 20, 30],276]?;277278let out = df.transpose(None, None)?;279let expected = df![280"column_0" => [1, 10],281"column_1" => [2, 20],282"column_2" => [3, 30],283284]?;285assert!(out.equals_missing(&expected));286287let mut df = df![288"a" => [Some(1), None, Some(3)],289"b" => [Some(10), Some(20), None],290]?;291let out = df.transpose(None, None)?;292let expected = df![293"column_0" => [1, 10],294"column_1" => [None, Some(20)],295"column_2" => [Some(3), None],296297]?;298assert!(out.equals_missing(&expected));299300let mut df = df![301"a" => ["a", "b", "c"],302"b" => [Some(10), Some(20), None],303]?;304let out = df.transpose(None, None)?;305let expected = df![306"column_0" => ["a", "10"],307"column_1" => ["b", "20"],308"column_2" => [Some("c"), None],309310]?;311assert!(out.equals_missing(&expected));312Ok(())313}314}315316317