Path: blob/main/crates/polars-ops/src/series/ops/various.rs
6939 views
use num_traits::Bounded;1#[cfg(feature = "dtype-struct")]2use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;3use polars_core::prelude::arity::unary_elementwise_values;4use polars_core::prelude::*;5use polars_core::series::IsSorted;6use polars_core::with_match_physical_numeric_polars_type;7#[cfg(feature = "hash")]8use polars_utils::aliases::PlSeedableRandomStateQuality;9use polars_utils::total_ord::TotalOrd;1011use crate::series::ops::SeriesSealed;1213pub trait SeriesMethods: SeriesSealed {14/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`15/// with dtype [`IdxType`]16fn value_counts(17&self,18sort: bool,19parallel: bool,20name: PlSmallStr,21normalize: bool,22) -> PolarsResult<DataFrame> {23let s = self.as_series();24polars_ensure!(25s.name() != &name,26Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \27column names; change `name` to fix", name,28);29// we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined30let groups = s.group_tuples(parallel, sort)?;31let values = unsafe { s.agg_first(&groups) }32.with_name(s.name().clone())33.into();34let counts = groups.group_count().with_name(name.clone());3536let counts = if normalize {37let len = s.len() as f64;38let counts: Float64Chunked =39unary_elementwise_values(&counts, |count| count as f64 / len);40counts.into_column()41} else {42counts.into_column()43};4445let height = counts.len();46let cols = vec![values, counts];47let df = unsafe { DataFrame::new_no_checks(height, cols) };48if sort {49df.sort(50[name],51SortMultipleOptions::default()52.with_order_descending(true)53.with_multithreaded(parallel),54)55} else {56Ok(df)57}58}5960#[cfg(feature = "hash")]61fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {62let s = self.as_series().to_physical_repr();63match s.dtype() {64DataType::List(_) => {65let mut ca = s.list().unwrap().clone();66crate::chunked_array::hash::hash(&mut ca, build_hasher)67},68_ => {69let mut h = vec![];70s.0.vec_hash(build_hasher, &mut h).unwrap();71UInt64Chunked::from_vec(s.name().clone(), h)72},73}74}7576fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {77polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);78Ok(())79}8081/// Checks if a [`Series`] is sorted. Tries to fail fast.82fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {83let s = self.as_series();84let null_count = s.null_count();8586// fast paths87if (options.descending88&& (options.nulls_last || null_count == 0)89&& matches!(s.is_sorted_flag(), IsSorted::Descending))90|| (!options.descending91&& (!options.nulls_last || null_count == 0)92&& matches!(s.is_sorted_flag(), IsSorted::Ascending))93{94return Ok(true);95}9697// for struct types we row-encode and recurse98#[cfg(feature = "dtype-struct")]99if matches!(s.dtype(), DataType::Struct(_)) {100let encoded = _get_rows_encoded_ca(101PlSmallStr::EMPTY,102&[s.clone().into()],103&[options.descending],104&[options.nulls_last],105)?;106return encoded.into_series().is_sorted(options);107}108109let s_len = s.len();110if null_count == s_len {111// All nulls is all equal112return Ok(true);113}114// Check if nulls are in the right location.115if null_count > 0 {116// The slice triggers a fast null count117if options.nulls_last {118if s.slice((s_len - null_count) as i64, null_count)119.null_count()120!= null_count121{122return Ok(false);123}124} else if s.slice(0, null_count).null_count() != null_count {125return Ok(false);126}127}128129if s.dtype().is_primitive_numeric() {130with_match_physical_numeric_polars_type!(s.dtype(), |$T| {131let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();132return Ok(is_sorted_ca_num::<$T>(ca, options))133})134}135136let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do137// TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.138// Compare adjacent elements with no-copy slices that don't include any nulls139let offset = !options.nulls_last as i64 * null_count as i64;140let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));141let cmp_op = if options.descending {142Series::gt_eq143} else {144Series::lt_eq145};146Ok(cmp_op(&s1, &s2)?.all())147}148}149150fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(151vals: &[T],152f: Cmp,153previous: &mut T,154) -> bool {155let mut sorted = true;156157// Outer loop so we can fail fast158// Inner loop will auto vectorize159for c in vals.chunks(1024) {160// don't early stop or branch161// so it autovectorizes162for v in c {163sorted &= f(previous, v);164*previous = *v;165}166if !sorted {167return false;168}169}170sorted171}172173// Assumes nulls last/first is already checked.174fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {175if let Ok(vals) = ca.cont_slice() {176let mut previous = vals[0];177return if options.descending {178check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)179} else {180check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)181};182};183184if ca.null_count() == 0 {185let mut previous = if options.descending {186T::Native::max_value()187} else {188T::Native::min_value()189};190for arr in ca.downcast_iter() {191let vals = arr.values();192193let sorted = if options.descending {194check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)195} else {196check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)197};198if !sorted {199return false;200}201}202return true;203};204205// Slice off nulls and recurse.206let null_count = ca.null_count();207if options.nulls_last {208let ca = ca.slice(0, ca.len() - null_count);209is_sorted_ca_num(&ca, options)210} else {211let ca = ca.slice(null_count as i64, ca.len() - null_count);212is_sorted_ca_num(&ca, options)213}214}215216impl SeriesMethods for Series {}217218219