Path: blob/main/crates/polars-compute/src/find_validity_mismatch.rs
6939 views
use arrow::array::{Array, FixedSizeListArray, ListArray, StructArray};1use arrow::datatypes::ArrowDataType;2use arrow::types::Offset;3use polars_utils::IdxSize;4use polars_utils::itertools::Itertools;56use crate::cast::CastOptionsImpl;78/// Find the indices of the values where the validity mismatches.9///10/// This is done recursively, meaning that a validity mismatch at a deeper level will result as at11/// the level above at the corresponding index.12///13/// This procedure requires that14/// - Nulls are propagated recursively15/// - Lists to be16/// - trimmed to normalized offsets17/// - have the same number of child elements below each element (even nulls)18pub fn find_validity_mismatch(left: &dyn Array, right: &dyn Array, idxs: &mut Vec<IdxSize>) {19assert_eq!(left.len(), right.len());2021// Handle the top-level.22//23// NOTE: This is done always, even if left and right have different nestings. This is24// intentional and needed.25let original_idxs_length = idxs.len();26match (left.validity(), right.validity()) {27(None, None) => {},28(Some(l), Some(r)) => {29if l != r {30let mismatches = arrow::bitmap::xor(l, r);31idxs.extend(mismatches.true_idx_iter().map(|i| i as IdxSize));32}33},34(Some(v), _) | (_, Some(v)) => {35if v.unset_bits() > 0 {36let mismatches = !v;37idxs.extend(mismatches.true_idx_iter().map(|i| i as IdxSize));38}39},40}4142let left = left.as_any();43let right = right.as_any();4445let pre_nesting_length = idxs.len();46// (Struct, Struct)47if let (Some(left), Some(right)) = (48left.downcast_ref::<StructArray>(),49right.downcast_ref::<StructArray>(),50) {51assert_eq!(left.fields().len(), right.fields().len());52for (l, r) in left.values().iter().zip(right.values().iter()) {53find_validity_mismatch(l.as_ref(), r.as_ref(), idxs);54}55}5657// (List, List)58if let (Some(left), Some(right)) = (59left.downcast_ref::<ListArray<i32>>(),60right.downcast_ref::<ListArray<i32>>(),61) {62find_validity_mismatch_list_list_nested(left, right, idxs);63}64if let (Some(left), Some(right)) = (65left.downcast_ref::<ListArray<i64>>(),66right.downcast_ref::<ListArray<i64>>(),67) {68find_validity_mismatch_list_list_nested(left, right, idxs);69}7071// (FixedSizeList, FixedSizeList)72if let (Some(left), Some(right)) = (73left.downcast_ref::<FixedSizeListArray>(),74right.downcast_ref::<FixedSizeListArray>(),75) {76assert_eq!(left.size(), right.size());77find_validity_mismatch_fsl_fsl_nested(78left.values().as_ref(),79right.values().as_ref(),80left.size(),81idxs,82)83}8485// (List, Array) / (Array, List)86if let (Some(left), Some(right)) = (87left.downcast_ref::<ListArray<i32>>(),88right.downcast_ref::<FixedSizeListArray>(),89) {90find_validity_mismatch_list_fsl_impl(left, right, idxs);91}92if let (Some(left), Some(right)) = (93left.downcast_ref::<ListArray<i64>>(),94right.downcast_ref::<FixedSizeListArray>(),95) {96find_validity_mismatch_list_fsl_impl(left, right, idxs);97}98if let (Some(right), Some(left)) = (99left.downcast_ref::<FixedSizeListArray>(),100right.downcast_ref::<ListArray<i32>>(),101) {102find_validity_mismatch_list_fsl_impl(left, right, idxs);103}104if let (Some(right), Some(left)) = (105left.downcast_ref::<FixedSizeListArray>(),106right.downcast_ref::<ListArray<i64>>(),107) {108find_validity_mismatch_list_fsl_impl(left, right, idxs);109}110111if pre_nesting_length == idxs.len() {112return;113}114idxs[original_idxs_length..].sort_unstable();115}116117fn find_validity_mismatch_fsl_fsl_nested(118left: &dyn Array,119right: &dyn Array,120size: usize,121idxs: &mut Vec<IdxSize>,122) {123assert_eq!(left.len(), right.len());124let start_length = idxs.len();125find_validity_mismatch(left, right, idxs);126if idxs.len() > start_length {127let mut offset = 0;128idxs[start_length] /= size as IdxSize;129for i in start_length + 1..idxs.len() {130idxs[i - offset] = idxs[i] / size as IdxSize;131132if idxs[i - offset] == idxs[i - offset - 1] {133offset += 1;134}135}136idxs.truncate(idxs.len() - offset);137}138}139140fn find_validity_mismatch_list_list_nested<O: Offset>(141left: &ListArray<O>,142right: &ListArray<O>,143idxs: &mut Vec<IdxSize>,144) {145let mut nested_idxs = Vec::new();146find_validity_mismatch(147left.values().as_ref(),148right.values().as_ref(),149&mut nested_idxs,150);151152if nested_idxs.is_empty() {153return;154}155156assert_eq!(left.offsets().first().to_usize(), 0);157assert_eq!(left.offsets().range().to_usize(), left.values().len());158159// @TODO: Optimize. This is only used on the error path so it is find, right?160let mut j = 0;161for (i, (start, length)) in left.offsets().offset_and_length_iter().enumerate_idx() {162if j < nested_idxs.len() && (nested_idxs[j] as usize) < start + length {163idxs.push(i);164j += 1;165166// Loop over remaining items in same element.167while j < nested_idxs.len() && (nested_idxs[j] as usize) < start + length {168j += 1;169}170}171172if j == nested_idxs.len() {173break;174}175}176}177178fn find_validity_mismatch_list_fsl_impl<O: Offset>(179left: &ListArray<O>,180right: &FixedSizeListArray,181idxs: &mut Vec<IdxSize>,182) {183if left.validity().is_none() && right.validity().is_none() {184find_validity_mismatch_fsl_fsl_nested(185left.values().as_ref(),186right.values().as_ref(),187right.size(),188idxs,189);190return;191}192193let (ArrowDataType::List(f) | ArrowDataType::LargeList(f)) = left.dtype() else {194unreachable!();195};196let left = crate::cast::cast_list_to_fixed_size_list(197left,198f,199right.size(),200CastOptionsImpl::default(),201)202.unwrap();203find_validity_mismatch_fsl_fsl_nested(204left.values().as_ref(),205right.values().as_ref(),206right.size(),207idxs,208)209}210211212