feat: LastCacheExec to track predicate pushdown in last cache queries (#25621)

Trevor Hilton 2024-12-06 10:53:19 -08:00 committed by GitHub
parent 9b87cd7a65
commit 154ff7da23
6 changed files with 807 additions and 259 deletions

View File

@ -1,5 +1,5 @@
use std::{
collections::{HashMap, HashSet, VecDeque},
collections::{BTreeSet, HashMap, HashSet, VecDeque},
ops::Deref,
sync::Arc,
time::{Duration, Instant},
@ -17,11 +17,6 @@ use arrow::{
},
error::ArrowError,
};
use datafusion::{
logical_expr::{expr::InList, BinaryExpr, Operator},
prelude::Expr,
scalar::ScalarValue,
};
use indexmap::{IndexMap, IndexSet};
use influxdb3_catalog::catalog::{ColumnDefinition, TableDefinition, TIME_COLUMN_NAME};
use influxdb3_id::{ColumnId, TableId};
@ -56,8 +51,6 @@ pub(crate) struct LastCache {
/// map preserves the order of the elements, thereby maintaining the order of the keys in
/// the cache.
pub(crate) key_column_ids: Arc<IndexSet<ColumnId>>,
/// The key columns for this cache, by their names
pub(crate) key_column_name_to_ids: Arc<HashMap<Arc<str>, ColumnId>>,
/// The value columns for this cache
pub(crate) value_columns: ValueColumnType,
/// The Arrow Schema for the table that this cache is associated with
@ -86,6 +79,7 @@ pub struct CreateLastCacheArgs {
/// The default cache time-to-live (TTL) is 4 hours
pub(crate) const DEFAULT_CACHE_TTL: Duration = Duration::from_secs(60 * 60 * 4);
/// The time to live (TTL) for entries in the cache
#[derive(Debug, Clone, Copy, Deserialize)]
pub struct LastCacheTtl(Duration);
@ -115,6 +109,7 @@ impl Default for LastCacheTtl {
}
}
/// Specifies the key column configuration for a new [`LastCache`]
#[derive(Debug, Default, Clone)]
pub enum LastCacheKeyColumnsArg {
/// Use the series key columns in their order as the last cache key columns
@ -133,6 +128,7 @@ impl From<Option<Vec<ColumnId>>> for LastCacheKeyColumnsArg {
}
}
/// Specifies the value column configuration for a new [`LastCache`]
#[derive(Debug, Default, Clone)]
pub enum LastCacheValueColumnsArg {
/// Use all non-key columns as value columns when initialized, and add new field columns that
@ -156,6 +152,10 @@ impl From<Option<Vec<ColumnId>>> for LastCacheValueColumnsArg {
impl LastCache {
/// Create a new [`LastCache`]
///
/// This uses the provided `TableDefinition` to build an Arrow schema for the cache. It will
/// validate the given arguments and can error if invalid columns are specified, or if an
/// incompatible column is used as a key for the cache.
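///
/// A minimal sketch of constructing a cache, mirroring the arguments used in the tests below
/// (assumes the default size and TTL):
///
/// ```ignore
/// let cache = LastCache::new(CreateLastCacheArgs {
///     table_def: Arc::clone(&table_def),
///     count: LastCacheSize::default(),
///     ttl: LastCacheTtl::default(),
///     key_columns: LastCacheKeyColumnsArg::SeriesKey,
///     value_columns: LastCacheValueColumnsArg::AcceptNew,
/// })?;
/// ```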
pub(crate) fn new(
CreateLastCacheArgs {
table_def,
@ -260,7 +260,6 @@ impl LastCache {
count,
ttl: ttl.into(),
key_column_ids: Arc::new(key_column_ids),
key_column_name_to_ids: Arc::new(key_column_name_to_ids),
value_columns: match value_columns {
LastCacheValueColumnsArg::AcceptNew => ValueColumnType::AcceptNew { seen },
LastCacheValueColumnsArg::Explicit(_) => ValueColumnType::Explicit {
@ -340,32 +339,36 @@ impl LastCache {
/// This will panic if the internal cache state's keys are out-of-order with respect to the
/// order of the `key_columns` on this [`LastCache`]
pub(crate) fn push(&mut self, row: &Row, table_def: Arc<TableDefinition>) {
let mut values = Vec::with_capacity(self.key_column_ids.len());
for id in self.key_column_ids.iter() {
let Some(value) = row
.fields
.iter()
.find(|f| &f.id == id)
.map(|f| KeyValue::from(&f.value))
else {
// ignore the row if it does not contain all key columns
return;
};
values.push(value);
}
let accept_new_fields = self.accept_new_fields();
let mut target = &mut self.state;
let mut key_iter = self.key_column_ids.iter().peekable();
while let (Some(col_id), peek) = (key_iter.next(), key_iter.peek()) {
let mut iter = self.key_column_ids.iter().zip(values).peekable();
while let (Some((col_id, value)), peek) = (iter.next(), iter.peek()) {
if target.is_init() {
*target = LastCacheState::Key(LastCacheKey {
column_id: *col_id,
value_map: Default::default(),
});
}
let Some(value) = row
.fields
.iter()
.find(|f| f.id == *col_id)
.map(|f| KeyValue::from(&f.value))
else {
// ignore the row if it does not contain all key columns
return;
};
let cache_key = target.as_key_mut().unwrap();
assert_eq!(
&cache_key.column_id, col_id,
"key columns must match cache key order"
);
target = cache_key.value_map.entry(value).or_insert_with(|| {
if let Some(next_col_id) = peek {
if let Some((next_col_id, _)) = peek {
LastCacheState::Key(LastCacheKey {
column_id: **next_col_id,
value_map: Default::default(),
@ -411,14 +414,14 @@ impl LastCache {
pub(crate) fn to_record_batches(
&self,
table_def: Arc<TableDefinition>,
predicates: &[Predicate],
predicates: &IndexMap<ColumnId, Predicate>,
) -> Result<Vec<RecordBatch>, ArrowError> {
// map the provided predicates onto the key columns
// there may not be predicates provided for each key column, hence the Option
let predicates: Vec<Option<&Predicate>> = self
.key_column_ids
.iter()
.map(|col_id| predicates.iter().find(|p| p.column_id == *col_id))
.map(|id| predicates.get(id))
.collect();
let mut caches = vec![ExtendedLastCacheState {
@ -465,78 +468,6 @@ impl LastCache {
.collect()
}
/// Convert a set of DataFusion filter [`Expr`]s into [`Predicate`]s
///
/// This only handles binary expressions, e.g., `foo = 'bar'`, and will use the `key_columns`
/// to filter out expressions that do not match key columns in the cache.
pub(crate) fn convert_filter_exprs(&self, exprs: &[Expr]) -> Vec<Predicate> {
exprs
.iter()
.filter_map(|expr| {
match expr {
Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
let col_id = if let Expr::Column(c) = left.as_ref() {
self.key_column_name_to_ids.get(c.name()).copied()?
} else {
return None;
};
let value = match right.as_ref() {
Expr::Literal(ScalarValue::Utf8(Some(v))) => {
KeyValue::String(v.to_owned())
}
Expr::Literal(ScalarValue::Boolean(Some(v))) => KeyValue::Bool(*v),
// TODO: handle integer types that can be casted up to i64/u64:
Expr::Literal(ScalarValue::Int64(Some(v))) => KeyValue::Int(*v),
Expr::Literal(ScalarValue::UInt64(Some(v))) => KeyValue::UInt(*v),
_ => return None,
};
match op {
Operator::Eq => Some(Predicate::new_eq(col_id, value)),
Operator::NotEq => Some(Predicate::new_not_eq(col_id, value)),
_ => None,
}
}
Expr::InList(InList {
expr,
list,
negated,
}) => {
let col_id = if let Expr::Column(c) = expr.as_ref() {
self.key_column_name_to_ids.get(c.name()).copied()?
} else {
return None;
};
let values: Vec<KeyValue> = list
.iter()
.filter_map(|e| match e {
Expr::Literal(ScalarValue::Utf8(Some(v))) => {
Some(KeyValue::String(v.to_owned()))
}
Expr::Literal(ScalarValue::Boolean(Some(v))) => {
Some(KeyValue::Bool(*v))
}
// TODO: handle integer types that can be casted up to i64/u64:
Expr::Literal(ScalarValue::Int64(Some(v))) => {
Some(KeyValue::Int(*v))
}
Expr::Literal(ScalarValue::UInt64(Some(v))) => {
Some(KeyValue::UInt(*v))
}
_ => None,
})
.collect();
if *negated {
Some(Predicate::new_not_in(col_id, values))
} else {
Some(Predicate::new_in(col_id, values))
}
}
_ => None,
}
})
.collect()
}
/// Remove expired values from the internal cache state
pub(crate) fn remove_expired(&mut self) {
self.state.remove_expired();
@ -663,50 +594,52 @@ impl ExtendedLastCacheState<'_> {
}
/// A predicate used for evaluating key column values in the cache on query
///
/// Can be either an inclusive or an exclusive set. A `BTreeSet` is used so that the predicate
/// values are ordered and displayed in sorted order in query `EXPLAIN` plans.
#[derive(Debug, Clone)]
pub struct Predicate {
/// The left-hand-side of the predicate as a valid `ColumnId`
column_id: ColumnId,
/// The right-hand-side of the predicate
kind: PredicateKind,
pub(crate) enum Predicate {
In(BTreeSet<KeyValue>),
NotIn(BTreeSet<KeyValue>),
}
impl std::fmt::Display for Predicate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Predicate::In(_) => write!(f, "IN (")?,
Predicate::NotIn(_) => write!(f, "NOT IN (")?,
}
let mut values = self.values();
while let Some(v) = values.next() {
write!(f, "{v}")?;
if values.size_hint().0 > 0 {
write!(f, ",")?;
}
}
write!(f, ")")
}
}
impl Predicate {
pub(crate) fn new_eq(column_id: ColumnId, value: KeyValue) -> Self {
Self {
column_id,
kind: PredicateKind::Eq(value),
}
}
pub(crate) fn new_not_eq(column_id: ColumnId, value: KeyValue) -> Self {
Self {
column_id,
kind: PredicateKind::NotEq(value),
}
}
pub(crate) fn new_in(column_id: ColumnId, values: Vec<KeyValue>) -> Self {
Self {
column_id,
kind: PredicateKind::In(values),
}
}
pub(crate) fn new_not_in(column_id: ColumnId, values: Vec<KeyValue>) -> Self {
Self {
column_id,
kind: PredicateKind::NotIn(values),
fn values(&self) -> impl Iterator<Item = &KeyValue> {
match self {
Predicate::In(btree_set) => btree_set.iter(),
Predicate::NotIn(btree_set) => btree_set.iter(),
}
}
}
#[derive(Debug, Clone)]
pub(crate) enum PredicateKind {
Eq(KeyValue),
NotEq(KeyValue),
In(Vec<KeyValue>),
NotIn(Vec<KeyValue>),
#[cfg(test)]
impl Predicate {
pub(crate) fn new_in(values: impl IntoIterator<Item = KeyValue>) -> Self {
Self::In(values.into_iter().collect())
}
pub(crate) fn new_not_in(values: impl IntoIterator<Item = KeyValue>) -> Self {
Self::NotIn(values.into_iter().collect())
}
}
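// For illustration, a sketch (using the test-only constructors in this module) of how a
// predicate and its key values render via the `Display` impls in this file, e.g. within
// `EXPLAIN` output:
//
//     let p = Predicate::new_in([KeyValue::string("a"), KeyValue::Int(2)]);
//     assert_eq!(p.to_string(), "IN ('a',2i)");
//
// The `BTreeSet` keeps the values sorted, so the rendered order is deterministic.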
/// Represents the hierarchical last cache structure
@ -777,37 +710,16 @@ struct LastCacheKey {
impl LastCacheKey {
/// Evaluate the provided [`Predicate`] by using its value to lookup in this [`LastCacheKey`]'s
/// value map.
///
/// # Panics
///
/// This assumes that a predicate for this [`LastCacheKey`]'s column was provided, and will panic
/// otherwise.
fn evaluate_predicate<'a: 'b, 'b>(
&'a self,
predicate: &'b Predicate,
) -> Vec<(&'a LastCacheState, &'b KeyValue)> {
if predicate.column_id != self.column_id {
panic!(
"attempted to evaluate unexpected predicate with key {} for column with id {}",
predicate.column_id, self.column_id
);
}
match &predicate.kind {
PredicateKind::Eq(val) => self
.value_map
.get(val)
.map(|s| vec![(s, val)])
.unwrap_or_default(),
PredicateKind::NotEq(val) => self
.value_map
.iter()
.filter_map(|(v, s)| (v != val).then_some((s, v)))
.collect(),
PredicateKind::In(vals) => vals
match predicate {
Predicate::In(vals) => vals
.iter()
.filter_map(|v| self.value_map.get(v).map(|s| (s, v)))
.collect(),
PredicateKind::NotIn(vals) => self
Predicate::NotIn(vals) => self
.value_map
.iter()
.filter_map(|(v, s)| (!vals.contains(v)).then_some((s, v)))
@ -828,7 +740,7 @@ impl LastCacheKey {
}
/// A value for a key column in a [`LastCache`]
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub(crate) enum KeyValue {
String(String),
Int(i64),
@ -836,6 +748,17 @@ pub(crate) enum KeyValue {
Bool(bool),
}
impl std::fmt::Display for KeyValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
KeyValue::String(s) => write!(f, "'{s}'"),
KeyValue::Int(i) => write!(f, "{i}i"),
KeyValue::UInt(u) => write!(f, "{u}u"),
KeyValue::Bool(b) => write!(f, "{b}"),
}
}
}
#[cfg(test)]
impl KeyValue {
pub(crate) fn string(s: impl Into<String>) -> Self {

View File

@ -8,7 +8,7 @@ mod provider;
pub use provider::LastCacheProvider;
mod table_function;
use schema::InfluxColumnType;
pub use table_function::LastCacheFunction;
pub use table_function::{LastCacheFunction, LAST_CACHE_UDTF_NAME};
#[derive(Debug, thiserror::Error)]
pub enum Error {
@ -44,8 +44,11 @@ impl Error {
mod tests {
use std::{cmp::Ordering, sync::Arc, thread, time::Duration};
use arrow::array::AsArray;
use arrow_util::{assert_batches_eq, assert_batches_sorted_eq};
use bimap::BiHashMap;
use datafusion::prelude::SessionContext;
use indexmap::IndexMap;
use influxdb3_catalog::catalog::{Catalog, DatabaseSchema, TableDefinition};
use influxdb3_id::{ColumnId, DbId, SerdeVecMap, TableId};
use influxdb3_wal::{LastCacheDefinition, LastCacheSize};
@ -56,13 +59,19 @@ mod tests {
KeyValue, LastCache, LastCacheKeyColumnsArg, LastCacheValueColumnsArg, Predicate,
DEFAULT_CACHE_TTL,
},
CreateLastCacheArgs, LastCacheProvider,
CreateLastCacheArgs, LastCacheFunction, LastCacheProvider, LAST_CACHE_UDTF_NAME,
},
test_helpers::{column_ids_for_names, TestWriter},
};
use super::LastCacheTtl;
fn predicates(
preds: impl IntoIterator<Item = (ColumnId, Predicate)>,
) -> IndexMap<ColumnId, Predicate> {
preds.into_iter().collect()
}
#[test]
fn pick_up_latest_write() {
let writer = TestWriter::new();
@ -88,11 +97,11 @@ mod tests {
cache.push(row, Arc::clone(&table_def));
}
let predicates = &[Predicate::new_eq(col_id, KeyValue::string("a"))];
let predicates = predicates([(col_id, Predicate::new_in([KeyValue::string("a")]))]);
// Check what is in the last cache:
let batch = cache
.to_record_batches(Arc::clone(&table_def), predicates)
.to_record_batches(Arc::clone(&table_def), &predicates)
.unwrap();
assert_batches_eq!(
@ -112,7 +121,7 @@ mod tests {
cache.push(row, Arc::clone(&table_def));
}
let batch = cache.to_record_batches(table_def, predicates).unwrap();
let batch = cache.to_record_batches(table_def, &predicates).unwrap();
assert_batches_eq!(
[
@ -177,17 +186,17 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
// Predicate including both key columns only produces value columns from the cache
TestCase {
predicates: &[
Predicate::new_eq(region_col_id, KeyValue::string("us")),
Predicate::new_eq(host_col_id, KeyValue::string("c")),
],
predicates: predicates([
(region_col_id, Predicate::new_in([KeyValue::string("us")])),
(host_col_id, Predicate::new_in([KeyValue::string("c")])),
]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -199,7 +208,10 @@ mod tests {
// Predicate on only region key column will have host column outputted in addition to
// the value columns:
TestCase {
predicates: &[Predicate::new_eq(region_col_id, KeyValue::string("us"))],
predicates: predicates([(
region_col_id,
Predicate::new_in([KeyValue::string("us")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -212,7 +224,10 @@ mod tests {
},
// Similar to previous, with a different region predicate:
TestCase {
predicates: &[Predicate::new_eq(region_col_id, KeyValue::string("ca"))],
predicates: predicates([(
region_col_id,
Predicate::new_in([KeyValue::string("ca")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -226,7 +241,7 @@ mod tests {
// Predicate on only host key column will have region column outputted in addition to
// the value columns:
TestCase {
predicates: &[Predicate::new_eq(host_col_id, KeyValue::string("a"))],
predicates: predicates([(host_col_id, Predicate::new_in([KeyValue::string("a")]))]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -238,7 +253,7 @@ mod tests {
// Omitting all key columns from the predicate will have all key columns included in
// the query result:
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -255,10 +270,10 @@ mod tests {
// Using a non-existent key column as a predicate has no effect:
// TODO: should this be an error?
TestCase {
predicates: &[Predicate::new_eq(
predicates: predicates([(
ColumnId::new(),
KeyValue::string("12345"),
)],
Predicate::new_in([KeyValue::string("12345")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -274,31 +289,37 @@ mod tests {
},
// Using a non-existent key column value yields an empty result set:
TestCase {
predicates: &[Predicate::new_eq(region_col_id, KeyValue::string("eu"))],
predicates: predicates([(
region_col_id,
Predicate::new_in([KeyValue::string("eu")]),
)]),
expected: &["++", "++"],
},
// Using an invalid combination of key column values yields an empty result set:
TestCase {
predicates: &[
Predicate::new_eq(region_col_id, KeyValue::string("ca")),
Predicate::new_eq(host_col_id, KeyValue::string("a")),
],
predicates: predicates([
(region_col_id, Predicate::new_in([KeyValue::string("ca")])),
(host_col_id, Predicate::new_in([KeyValue::string("a")])),
]),
expected: &["++", "++"],
},
// Using a non-existent key column value (for the host column) also yields an empty result set:
TestCase {
predicates: &[Predicate::new_eq(host_col_id, KeyValue::string("g"))],
predicates: predicates([(host_col_id, Predicate::new_in([KeyValue::string("g")]))]),
expected: &["++", "++"],
},
// Using an incorrect type for a key column value in a predicate also yields an empty result
// set. TODO: should this be an error?
TestCase {
predicates: &[Predicate::new_eq(host_col_id, KeyValue::Bool(true))],
predicates: predicates([(host_col_id, Predicate::new_in([KeyValue::Bool(true)]))]),
expected: &["++", "++"],
},
// Using a != predicate
// Using a NOT IN predicate
TestCase {
predicates: &[Predicate::new_not_eq(region_col_id, KeyValue::string("us"))],
predicates: predicates([(
region_col_id,
Predicate::new_not_in([KeyValue::string("us")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -311,10 +332,10 @@ mod tests {
},
// Using an IN predicate:
TestCase {
predicates: &[Predicate::new_in(
predicates: predicates([(
host_col_id,
vec![KeyValue::string("a"), KeyValue::string("b")],
)],
Predicate::new_in([KeyValue::string("a"), KeyValue::string("b")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -326,10 +347,10 @@ mod tests {
},
// Using a NOT IN predicate:
TestCase {
predicates: &[Predicate::new_not_in(
predicates: predicates([(
host_col_id,
vec![KeyValue::string("a"), KeyValue::string("b")],
)],
Predicate::new_not_in([KeyValue::string("a"), KeyValue::string("b")]),
)]),
expected: &[
"+--------+------+-----------------------------+-------+",
"| region | host | time | usage |",
@ -345,7 +366,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
@ -408,16 +429,16 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
TestCase {
predicates: &[
Predicate::new_eq(region_col_id, KeyValue::string("us")),
Predicate::new_eq(host_col_id, KeyValue::string("a")),
],
predicates: predicates([
(region_col_id, Predicate::new_in([KeyValue::string("us")])),
(host_col_id, Predicate::new_in([KeyValue::string("a")])),
]),
expected: &[
"+--------+------+--------------------------------+-------+",
"| region | host | time | usage |",
@ -430,7 +451,10 @@ mod tests {
],
},
TestCase {
predicates: &[Predicate::new_eq(region_col_id, KeyValue::string("us"))],
predicates: predicates([(
region_col_id,
Predicate::new_in([KeyValue::string("us")]),
)]),
expected: &[
"+--------+------+--------------------------------+-------+",
"| region | host | time | usage |",
@ -447,7 +471,7 @@ mod tests {
],
},
TestCase {
predicates: &[Predicate::new_eq(host_col_id, KeyValue::string("a"))],
predicates: predicates([(host_col_id, Predicate::new_in([KeyValue::string("a")]))]),
expected: &[
"+--------+------+--------------------------------+-------+",
"| region | host | time | usage |",
@ -460,7 +484,7 @@ mod tests {
],
},
TestCase {
predicates: &[Predicate::new_eq(host_col_id, KeyValue::string("b"))],
predicates: predicates([(host_col_id, Predicate::new_in([KeyValue::string("b")]))]),
expected: &[
"+--------+------+--------------------------------+-------+",
"| region | host | time | usage |",
@ -473,7 +497,7 @@ mod tests {
],
},
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+--------+------+--------------------------------+-------+",
"| region | host | time | usage |",
@ -493,7 +517,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
}
@ -536,15 +560,13 @@ mod tests {
}
// Check the cache for values:
let predicates = &[
Predicate::new_eq(region_col_id, KeyValue::string("us")),
Predicate::new_eq(host_col_id, KeyValue::string("a")),
];
let p = predicates([
(region_col_id, Predicate::new_in([KeyValue::string("us")])),
(host_col_id, Predicate::new_in([KeyValue::string("a")])),
]);
// Check what is in the last cache:
let batches = cache
.to_record_batches(Arc::clone(&table_def), predicates)
.unwrap();
let batches = cache.to_record_batches(Arc::clone(&table_def), &p).unwrap();
assert_batches_sorted_eq!(
[
@ -561,9 +583,7 @@ mod tests {
thread::sleep(Duration::from_millis(1000));
// Check what is in the last cache:
let batches = cache
.to_record_batches(Arc::clone(&table_def), predicates)
.unwrap();
let batches = cache.to_record_batches(Arc::clone(&table_def), &p).unwrap();
// The cache is completely empty after the TTL evicted data, so it will give back nothing:
assert_batches_sorted_eq!(
@ -583,12 +603,10 @@ mod tests {
}
// Check the cache for values:
let predicates = &[Predicate::new_eq(host_col_id, KeyValue::string("a"))];
let p = predicates([(host_col_id, Predicate::new_in([KeyValue::string("a")]))]);
// Check what is in the last cache:
let batches = cache
.to_record_batches(Arc::clone(&table_def), predicates)
.unwrap();
let batches = cache.to_record_batches(Arc::clone(&table_def), &p).unwrap();
assert_batches_sorted_eq!(
[
@ -645,14 +663,14 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
// No predicates gives everything:
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+--------------+--------+-------------+-----------+---------+-----------------------------+",
"| component_id | active | type | loc | reading | time |",
@ -668,7 +686,9 @@ mod tests {
},
// Predicates on tag key column work as expected:
TestCase {
predicates: &[Predicate::new_eq(component_id_col_id, KeyValue::string("333"))],
predicates: predicates([
(component_id_col_id, Predicate::new_in([KeyValue::string("333")]))
]),
expected: &[
"+--------------+--------+--------+------+---------+-----------------------------+",
"| component_id | active | type | loc | reading | time |",
@ -679,7 +699,9 @@ mod tests {
},
// Predicate on a non-string field key:
TestCase {
predicates: &[Predicate::new_eq(active_col_id, KeyValue::Bool(false))],
predicates: predicates([
(active_col_id, Predicate::new_in([KeyValue::Bool(false)]))
]),
expected: &[
"+--------------+--------+-------------+---------+---------+-----------------------------+",
"| component_id | active | type | loc | reading | time |",
@ -691,7 +713,9 @@ mod tests {
},
// Predicate on a string field key:
TestCase {
predicates: &[Predicate::new_eq(type_col_id, KeyValue::string("camera"))],
predicates: predicates([
(type_col_id, Predicate::new_in([KeyValue::string("camera")]))
]),
expected: &[
"+--------------+--------+--------+-----------+---------+-----------------------------+",
"| component_id | active | type | loc | reading | time |",
@ -706,7 +730,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
}
@ -748,14 +772,14 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
// No predicates yields everything in the cache
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+-------+--------+-------+-------+-----------------------------+",
"| state | county | farm | speed | time |",
@ -771,7 +795,10 @@ mod tests {
},
// Predicate on state column, which is part of the series key:
TestCase {
predicates: &[Predicate::new_eq(state_col_id, KeyValue::string("ca"))],
predicates: predicates([(
state_col_id,
Predicate::new_in([KeyValue::string("ca")]),
)]),
expected: &[
"+-------+--------+-------+-------+-----------------------------+",
"| state | county | farm | speed | time |",
@ -787,7 +814,10 @@ mod tests {
},
// Predicate on county column, which is part of the series key:
TestCase {
predicates: &[Predicate::new_eq(county_col_id, KeyValue::string("napa"))],
predicates: predicates([(
county_col_id,
Predicate::new_in([KeyValue::string("napa")]),
)]),
expected: &[
"+-------+--------+-------+-------+-----------------------------+",
"| state | county | farm | speed | time |",
@ -799,7 +829,10 @@ mod tests {
},
// Predicate on farm column, which is part of the series key:
TestCase {
predicates: &[Predicate::new_eq(farm_col_id, KeyValue::string("30-01"))],
predicates: predicates([(
farm_col_id,
Predicate::new_in([KeyValue::string("30-01")]),
)]),
expected: &[
"+-------+--------+-------+-------+-----------------------------+",
"| state | county | farm | speed | time |",
@ -810,11 +843,14 @@ mod tests {
},
// Predicate on all series key columns:
TestCase {
predicates: &[
Predicate::new_eq(state_col_id, KeyValue::string("ca")),
Predicate::new_eq(county_col_id, KeyValue::string("nevada")),
Predicate::new_eq(farm_col_id, KeyValue::string("40-01")),
],
predicates: predicates([
(state_col_id, Predicate::new_in([KeyValue::string("ca")])),
(
county_col_id,
Predicate::new_in([KeyValue::string("nevada")]),
),
(farm_col_id, Predicate::new_in([KeyValue::string("40-01")])),
]),
expected: &[
"+-------+--------+-------+-------+-----------------------------+",
"| state | county | farm | speed | time |",
@ -827,7 +863,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
@ -870,7 +906,7 @@ mod tests {
cache.push(row, Arc::clone(&table_def));
}
let batches = cache.to_record_batches(table_def, &[]).unwrap();
let batches = cache.to_record_batches(table_def, &predicates([])).unwrap();
assert_batches_sorted_eq!(
[
@ -925,14 +961,17 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
// Cache that has values in the zone columns should produce them:
TestCase {
predicates: &[Predicate::new_eq(game_id_col_id, KeyValue::string("4"))],
predicates: predicates([(
game_id_col_id,
Predicate::new_in([KeyValue::string("4")]),
)]),
expected: &[
"+---------+-----------+-----------------------------+------+------+",
"| game_id | player | time | type | zone |",
@ -943,7 +982,10 @@ mod tests {
},
// Cache that does not have a zone column will produce it with nulls:
TestCase {
predicates: &[Predicate::new_eq(game_id_col_id, KeyValue::string("1"))],
predicates: predicates([(
game_id_col_id,
Predicate::new_in([KeyValue::string("1")]),
)]),
expected: &[
"+---------+-----------+-----------------------------+------+------+",
"| game_id | player | time | type | zone |",
@ -954,7 +996,7 @@ mod tests {
},
// Pulling from multiple caches will fill in with nulls:
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+---------+-----------+-----------------------------+------+------+",
"| game_id | player | time | type | zone |",
@ -970,7 +1012,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
@ -1028,14 +1070,14 @@ mod tests {
}
struct TestCase<'a> {
predicates: &'a [Predicate],
predicates: IndexMap<ColumnId, Predicate>,
expected: &'a [&'a str],
}
let test_cases = [
// Can query on specific key column values:
TestCase {
predicates: &[Predicate::new_eq(t1_col_id, KeyValue::string("a"))],
predicates: predicates([(t1_col_id, Predicate::new_in([KeyValue::string("a")]))]),
expected: &[
"+----+-----+-----+-----+-----+--------------------------------+",
"| t1 | f1 | f2 | f3 | f4 | time |",
@ -1045,7 +1087,7 @@ mod tests {
],
},
TestCase {
predicates: &[Predicate::new_eq(t1_col_id, KeyValue::string("b"))],
predicates: predicates([(t1_col_id, Predicate::new_in([KeyValue::string("b")]))]),
expected: &[
"+----+------+----+------+------+--------------------------------+",
"| t1 | f1 | f2 | f3 | f4 | time |",
@ -1055,7 +1097,7 @@ mod tests {
],
},
TestCase {
predicates: &[Predicate::new_eq(t1_col_id, KeyValue::string("c"))],
predicates: predicates([(t1_col_id, Predicate::new_in([KeyValue::string("c")]))]),
expected: &[
"+----+-------+-------+-------+----+--------------------------------+",
"| t1 | f1 | f2 | f3 | f4 | time |",
@ -1066,7 +1108,7 @@ mod tests {
},
// Can query across key column values:
TestCase {
predicates: &[],
predicates: predicates([]),
expected: &[
"+----+-------+-------+-------+------+--------------------------------+",
"| t1 | f1 | f2 | f3 | f4 | time |",
@ -1081,7 +1123,7 @@ mod tests {
for t in test_cases {
let batches = cache
.to_record_batches(Arc::clone(&table_def), t.predicates)
.to_record_batches(Arc::clone(&table_def), &t.predicates)
.unwrap();
assert_batches_sorted_eq!(t.expected, &batches);
@ -1325,4 +1367,267 @@ mod tests {
});
insta::assert_json_snapshot!(caches);
}
/// This test sets up a [`LastCacheProvider`], creates a [`LastCache`] using the `region` and
/// `host` columns as keys, and then writes row data containing several unique combinations of
/// the key columns to the cache. It then sets up a DataFusion [`SessionContext`], registers
/// the [`LastCacheFunction`] as a UDTF, and runs a series of test cases to verify queries made
/// using the function.
///
/// The purpose of this is to verify that the predicate pushdown by the UDTF [`TableProvider`]
/// is working.
///
/// Each test case verifies both the `RecordBatch` output and the output of `EXPLAIN` for a given
/// query. The `EXPLAIN` output contains a line for the `LastCacheExec`, which lists any
/// predicates that were pushed down from the provided SQL query to the cache.
#[tokio::test]
async fn datafusion_udtf_predicate_conversion() {
let writer = TestWriter::new();
let _ = writer.write_lp_to_write_batch("cpu,region=us-east,host=a usage=99,temp=88", 0);
// create a last cache provider so we can use it to create our UDTF provider:
let db_schema = writer.db_schema();
let table_def = db_schema.table_definition("cpu").unwrap();
let provider = LastCacheProvider::new_from_catalog(writer.catalog()).unwrap();
provider
.create_cache(
db_schema.id,
None,
CreateLastCacheArgs {
table_def,
count: LastCacheSize::default(),
ttl: LastCacheTtl::default(),
key_columns: LastCacheKeyColumnsArg::SeriesKey,
value_columns: LastCacheValueColumnsArg::AcceptNew,
},
)
.unwrap();
// make some writes into the cache:
let write_batch = writer.write_lp_to_write_batch(
"\
cpu,region=us-east,host=a usage=77,temp=66\n\
cpu,region=us-east,host=b usage=77,temp=66\n\
cpu,region=us-west,host=c usage=77,temp=66\n\
cpu,region=us-west,host=d usage=77,temp=66\n\
cpu,region=ca-east,host=e usage=77,temp=66\n\
cpu,region=ca-cent,host=f usage=77,temp=66\n\
cpu,region=ca-west,host=g usage=77,temp=66\n\
cpu,region=ca-west,host=h usage=77,temp=66\n\
cpu,region=eu-cent,host=i usage=77,temp=66\n\
cpu,region=eu-cent,host=j usage=77,temp=66\n\
cpu,region=eu-west,host=k usage=77,temp=66\n\
cpu,region=eu-west,host=l usage=77,temp=66\n\
",
1_000,
);
let wal_contents = influxdb3_wal::create::wal_contents(
(0, 1, 0),
[influxdb3_wal::create::write_batch_op(write_batch)],
);
provider.write_wal_contents_to_cache(&wal_contents);
let ctx = SessionContext::new();
let last_cache_fn = LastCacheFunction::new(db_schema.id, Arc::clone(&provider));
ctx.register_udtf(LAST_CACHE_UDTF_NAME, Arc::new(last_cache_fn));
struct TestCase<'a> {
/// A short description of the test
_desc: &'a str,
/// A SQL expression to evaluate using the datafusion session context, should be of
/// the form:
/// ```sql
/// SELECT * FROM last_cache('cpu') ...
/// ```
sql: &'a str,
/// Expected record batch output
expected: &'a [&'a str],
/// Expected EXPLAIN output contains this.
///
/// For checking the `LastCacheExec` portion of the EXPLAIN output for the given `sql`
/// query. A "contains" is used instead of matching the whole EXPLAIN output to prevent
/// flakyness from upstream changes to other parts of the query plan.
explain_contains: &'a str,
}
let test_cases = [
TestCase {
_desc: "no predicates",
sql: "SELECT * FROM last_cache('cpu')",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| ca-cent | f | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-east | e | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | g | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | h | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | i | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | j | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | k | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | l | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | b | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | d | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains:
"LastCacheExec: inner=MemoryExec: partitions=1, partition_sizes=[12]",
},
TestCase {
_desc: "eq predicate on region",
sql: "SELECT * FROM last_cache('cpu') WHERE region = 'us-east'",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | b | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 IN ('us-east')]] inner=MemoryExec: partitions=1, partition_sizes=[2]",
},
TestCase {
_desc: "not eq predicate on region",
sql: "SELECT * FROM last_cache('cpu') WHERE region != 'us-east'",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| ca-cent | f | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-east | e | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | g | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | h | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | i | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | j | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | k | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | l | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | d | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 NOT IN ('us-east')]] inner=MemoryExec: partitions=1, partition_sizes=[10]",
},
TestCase {
_desc: "double eq predicate on region",
sql: "SELECT * FROM last_cache('cpu') \
WHERE region = 'us-east' \
OR region = 'us-west'",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | b | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | d | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 IN ('us-east','us-west')]] inner=MemoryExec: partitions=1, partition_sizes=[4]",
},
TestCase {
_desc: "triple eq predicate on region",
sql: "SELECT * FROM last_cache('cpu') \
WHERE region = 'us-east' \
OR region = 'us-west' \
OR region = 'ca-west'",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| ca-west | g | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | h | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | b | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | d | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 IN ('ca-west','us-east','us-west')]] inner=MemoryExec: partitions=1, partition_sizes=[6]",
},
TestCase {
_desc: "eq predicate on region AND eq predicate on host",
sql: "SELECT * FROM last_cache('cpu') \
WHERE (region = 'us-east' OR region = 'us-west') \
AND (host = 'a' OR host = 'c')",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 IN ('us-east','us-west')], [host@1 IN ('a','c')]] inner=MemoryExec: partitions=1, partition_sizes=[2]",
},
TestCase {
_desc: "in predicate on region",
sql: "SELECT * FROM last_cache('cpu') \
WHERE region IN ('ca-east', 'ca-west')",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| ca-east | e | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | g | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| ca-west | h | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 IN ('ca-east','ca-west')]] inner=MemoryExec: partitions=1, partition_sizes=[3]",
},
TestCase {
_desc: "not in predicate on region",
sql: "SELECT * FROM last_cache('cpu') \
WHERE region NOT IN ('ca-east', 'ca-west')",
expected: &[
"+---------+------+------+-----------------------------+-------+",
"| region | host | temp | time | usage |",
"+---------+------+------+-----------------------------+-------+",
"| ca-cent | f | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | i | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-cent | j | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | k | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| eu-west | l | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | a | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-east | b | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | c | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"| us-west | d | 66.0 | 1970-01-01T00:00:00.000001Z | 77.0 |",
"+---------+------+------+-----------------------------+-------+",
],
explain_contains: "LastCacheExec: predicates=[[region@0 NOT IN ('ca-east','ca-west')]] inner=MemoryExec: partitions=1, partition_sizes=[9]",
},
];
for tc in test_cases {
// do the query:
let results = ctx.sql(tc.sql).await.unwrap().collect().await.unwrap();
println!("test case: {}", tc._desc);
// check the result:
assert_batches_sorted_eq!(tc.expected, &results);
let explain = ctx
.sql(format!("EXPLAIN {sql}", sql = tc.sql).as_str())
.await
.unwrap()
.collect()
.await
.unwrap()
.pop()
.unwrap();
assert!(
explain
.column_by_name("plan")
.unwrap()
.as_string::<i32>()
.iter()
.any(|plan| plan.is_some_and(|plan| plan.contains(tc.explain_contains))),
"explain plan did not contain the expression:\n\n\
{expected}\n\n\
instead, the output was:\n\n\
{actual:#?}",
expected = tc.explain_contains,
actual = explain.column_by_name("plan").unwrap().as_string::<i32>(),
);
}
}
}

View File

@ -9,7 +9,7 @@ use observability_deps::tracing::debug;
use parking_lot::RwLock;
use super::{
cache::{LastCache, LastCacheValueColumnsArg, Predicate},
cache::{LastCache, LastCacheValueColumnsArg},
CreateLastCacheArgs, Error,
};
@ -341,7 +341,6 @@ impl LastCacheProvider {
db_id: DbId,
table_id: TableId,
cache_name: Option<&str>,
predicates: &[Predicate],
) -> Option<Result<Vec<RecordBatch>, ArrowError>> {
let table_def = self
.catalog
@ -362,7 +361,7 @@ impl LastCacheProvider {
None
}
})
.map(|lc| lc.to_record_batches(table_def, predicates))
.map(|lc| lc.to_record_batches(table_def, &Default::default()))
}
/// Returns the total number of caches contained in the provider

View File

@ -1,28 +1,49 @@
use std::{any::Any, sync::Arc};
use arrow::datatypes::SchemaRef;
use arrow::{array::RecordBatch, datatypes::SchemaRef};
use async_trait::async_trait;
use datafusion::{
catalog::{Session, TableProvider},
common::plan_err,
common::{internal_err, plan_err, DFSchema},
datasource::{function::TableFunctionImpl, TableType},
error::DataFusionError,
execution::context::ExecutionProps,
logical_expr::TableProviderFilterPushDown,
physical_plan::{memory::MemoryExec, ExecutionPlan},
physical_expr::{
create_physical_expr,
utils::{Guarantee, LiteralGuarantee},
},
physical_plan::{memory::MemoryExec, DisplayAs, DisplayFormatType, ExecutionPlan},
prelude::Expr,
scalar::ScalarValue,
};
use indexmap::{IndexMap, IndexSet};
use influxdb3_catalog::catalog::TableDefinition;
use influxdb3_id::DbId;
use influxdb3_id::{ColumnId, DbId};
use schema::{InfluxColumnType, InfluxFieldType};
use super::LastCacheProvider;
use super::{
cache::{KeyValue, Predicate},
LastCacheProvider,
};
/// The name of the function that is called to query the last cache
pub const LAST_CACHE_UDTF_NAME: &str = "last_cache";
/// Implementor of the [`TableProvider`] trait that is produced with a call to the
/// [`LastCacheFunction`]
#[derive(Debug)]
struct LastCacheFunctionProvider {
/// The database ID that the query calling to the cache is associated with
db_id: DbId,
/// The table definition that the cache being called is associated with
table_def: Arc<TableDefinition>,
/// The name of the cache
cache_name: Arc<str>,
/// Reference to the cache's schema
schema: SchemaRef,
/// Forwarded reference of the [`LastCacheProvider`], which is used to get the `LastCache`
/// for the query using the `db_id` and `table_def`.
provider: Arc<LastCacheProvider>,
}
@ -55,19 +76,32 @@ impl TableProvider for LastCacheFunctionProvider {
_limit: Option<usize>,
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
let read = self.provider.cache_map.read();
let batches = if let Some(cache) = read
let (predicates, batches) = if let Some(cache) = read
.get(&self.db_id)
.and_then(|db| db.get(&self.table_def.table_id))
.and_then(|tbl| tbl.get(&self.cache_name))
{
let predicates = cache.convert_filter_exprs(filters);
cache.to_record_batches(Arc::clone(&self.table_def), &predicates)?
let predicates = convert_filter_exprs(
self.table_def.as_ref(),
cache.key_column_ids.as_ref(),
Arc::clone(&self.schema),
filters,
)?;
let batches = cache.to_record_batches(Arc::clone(&self.table_def), &predicates)?;
((!predicates.is_empty()).then_some(predicates), batches)
} else {
// If there is no cache, it means that it was removed, in which case, we just return
// an empty set of record batches.
vec![]
(None, vec![])
};
let mut exec = MemoryExec::try_new(&[batches], self.schema(), projection.cloned())?;
drop(read);
let mut exec = LastCacheExec::try_new(
predicates,
Arc::clone(&self.table_def),
&[batches],
self.schema(),
projection.cloned(),
)?;
let show_sizes = ctx.config_options().explain.show_sizes;
exec = exec.with_show_sizes(show_sizes);
@ -76,6 +110,181 @@ impl TableProvider for LastCacheFunctionProvider {
}
}
/// Convert the given list of filter expressions `filters` to a map of [`ColumnId`] to [`Predicate`]
///
/// The resulting map is an [`IndexMap`] to ensure consistent ordering of entries in the map, which
/// makes testing the filter conversions easier via `EXPLAIN` query plans.
fn convert_filter_exprs(
table_def: &TableDefinition,
cache_key_column_ids: &IndexSet<ColumnId>,
cache_schema: SchemaRef,
filters: &[Expr],
) -> Result<IndexMap<ColumnId, Predicate>, DataFusionError> {
let mut predicate_map: IndexMap<ColumnId, Option<Predicate>> = IndexMap::new();
// used by `create_physical_expr` in the loop below:
let schema: DFSchema = cache_schema.try_into()?;
let props = ExecutionProps::new();
// The set of `filters` that is passed in from DataFusion varies: 1) based on how they are
// defined in the query, and 2) based on some decisions that DataFusion makes when parsing the
// query into the `Expr` syntax tree. For example, the predicate:
//
// WHERE foo IN ('bar', 'baz')
//
// instead of being expressed as an `InList`, would be simplified to the following `Expr` tree:
//
// [
// BinaryExpr {
// left: BinaryExpr { left: "foo", op: Eq, right: "bar" },
// op: Or,
// right: BinaryExpr { left: "foo", op: Eq, right: "baz" }
// }
// ]
//
// while the predicate:
//
// WHERE foo = 'bar' OR foo = 'baz' OR foo = 'bop' OR foo = 'bla'
//
// instead of being expressed as a tree of `BinaryExpr`s, is expressed as an `InList` with four
// entries:
//
// [
// InList { col: "foo", values: ["bar", "baz", "bop", "bla"], negated: false }
// ]
//
// Instead of handling all the combinations of `Expr`s that may be passed by the caller of
// `TableProvider::scan`, we can use the cache's schema to convert each `Expr` to a `PhysicalExpr`
// and analyze it using DataFusion's `LiteralGuarantee`.
//
// This will distill the provided set of `Expr`s down to either an IN list or a NOT IN list,
// which we can convert to the `Predicate` type for the last cache.
//
// Special handling is applied for the case where multiple literal guarantees are encountered for
// a given column. This happens for clauses joined with an AND conjunction. From the tests
// run thus far, this occurs when a query contains a WHERE clause, e.g.,
//
// WHERE a != 'foo' AND a != 'bar'
//
// or,
//
// WHERE a NOT IN ('foo', 'bar')
//
// which DataFusion simplifies to the previous clause that uses an AND binary expression.
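// As an illustrative sketch of the analysis (values taken from the UDTF test in this commit):
// analyzing the physical form of
//
//    region = 'us-east' OR region = 'us-west'
//
// yields a single `LiteralGuarantee` with `guarantee: Guarantee::In` on the `region` column and
// `literals: {'us-east', 'us-west'}`, which maps directly onto `Predicate::In` below.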
for expr in filters {
let physical_expr = create_physical_expr(expr, &schema, &props)?;
let literal_guarantees = LiteralGuarantee::analyze(&physical_expr);
for LiteralGuarantee {
column,
guarantee,
literals,
} in literal_guarantees
{
let Some(column_def) = table_def.column_definition(column.name()) else {
return plan_err!(
"invalid column name in filter expression: {}",
column.name()
);
};
// do not handle predicates on non-key columns, let datafusion do that:
if !cache_key_column_ids.contains(&column_def.id) {
continue;
}
// convert the literal values from the query into `KeyValue`s for the last cache
// predicate, and also validate that the literal type is compatible with the column
// being predicated.
let value_set = literals
.into_iter()
.map(|literal| match (literal, column_def.data_type) {
(
ScalarValue::Boolean(Some(b)),
InfluxColumnType::Field(InfluxFieldType::Boolean),
) => Ok(KeyValue::Bool(b)),
(
ScalarValue::Int64(Some(i)),
InfluxColumnType::Field(InfluxFieldType::Integer),
) => Ok(KeyValue::Int(i)),
(
ScalarValue::UInt64(Some(u)),
InfluxColumnType::Field(InfluxFieldType::UInteger),
) => Ok(KeyValue::UInt(u)),
(
ScalarValue::Utf8(Some(s))
| ScalarValue::Utf8View(Some(s))
| ScalarValue::LargeUtf8(Some(s)),
InfluxColumnType::Tag | InfluxColumnType::Field(InfluxFieldType::String),
) => Ok(KeyValue::String(s)),
// TODO: handle Dictionary here?
(other_literal, column_data_type) => {
plan_err!(
"incompatible literal applied in predicate to column, \
column: {}, \
literal: {other_literal}, \
column type: {column_data_type}",
column.name()
)
}
})
.collect::<Result<_, DataFusionError>>()?;
let mut predicate = match guarantee {
Guarantee::In => Predicate::In(value_set),
Guarantee::NotIn => Predicate::NotIn(value_set),
};
// place the predicate into the map, handling the case for a column already encountered
predicate_map
.entry(column_def.id)
.and_modify(|e| {
if let Some(existing) = e {
match (existing, &mut predicate) {
// if we encounter a IN predicate on a column for which we already have
// a IN guarantee, we take their intersection, i.e.,
//
// a IN (1, 2) AND a IN (2, 3)
//
// becomes
//
// a IN (2)
(Predicate::In(ref mut existing_set), Predicate::In(new_set)) => {
*existing_set =
existing_set.intersection(new_set).cloned().collect();
// if the result is empty, just remove the predicate
if existing_set.is_empty() {
e.take();
}
}
// if we encounter a NOT IN predicate on a column for which we already
// have a NOT IN guarantee, we extend the two, i.e.,
//
// a NOT IN (1, 2) AND a NOT IN (3, 4)
//
// becomes
//
// a NOT IN (1, 2, 3, 4)
(Predicate::NotIn(existing_set), Predicate::NotIn(new_set)) => {
existing_set.append(new_set)
}
// for non-matching predicate types, we just remove the predicate by taking the
// Option. We will let DataFusion handle the predicate at a higher
// filter level in this case...
_ => {
e.take();
}
}
}
})
.or_insert_with(|| Some(predicate));
}
}
Ok(predicate_map
.into_iter()
.filter_map(|(column_id, predicate)| predicate.map(|predicate| (column_id, predicate)))
.collect())
}
/// Implementor of the [`TableFunctionImpl`] trait, to be registered as a user-defined table
/// function in the DataFusion `SessionContext`.
#[derive(Debug)]
pub struct LastCacheFunction {
db_id: DbId,
@ -127,3 +336,117 @@ impl TableFunctionImpl for LastCacheFunction {
}))
}
}
/// Custom implementor of the [`ExecutionPlan`] trait for use by the last cache
///
/// Wraps a [`MemoryExec`] from DataFusion which it relies on for the actual implementation of the
/// [`ExecutionPlan`] trait. The additional functionality provided by this type is that it tracks
/// the predicates that are pushed down to the underlying cache during query planning/execution.
///
/// # Example
///
/// For a query that provides no predicates, or whose predicates do not get pushed down, the
/// `EXPLAIN` output will contain a line for the `LastCacheExec` with no predicates, as well as
/// the info emitted for the inner `MemoryExec`, e.g.,
///
/// ```text
/// LastCacheExec: inner=MemoryExec: partitions=1, partition_sizes=[12]
/// ```
///
/// For queries that do have predicates that get pushed down, the output will include them, e.g.,
///
/// ```text
/// LastCacheExec: predicates=[[region@0 IN ('us-east','us-west')]] inner=[...]
/// ```
#[derive(Debug)]
struct LastCacheExec {
inner: MemoryExec,
table_def: Arc<TableDefinition>,
predicates: Option<IndexMap<ColumnId, Predicate>>,
}
impl LastCacheExec {
fn try_new(
predicates: Option<IndexMap<ColumnId, Predicate>>,
table_def: Arc<TableDefinition>,
partitions: &[Vec<RecordBatch>],
cache_schema: SchemaRef,
projection: Option<Vec<usize>>,
) -> Result<Self, DataFusionError> {
Ok(Self {
inner: MemoryExec::try_new(partitions, cache_schema, projection)?,
table_def,
predicates,
})
}
fn with_show_sizes(self, show_sizes: bool) -> Self {
Self {
inner: self.inner.with_show_sizes(show_sizes),
..self
}
}
}
impl DisplayAs for LastCacheExec {
fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(f, "LastCacheExec:")?;
if let Some(predicates) = self.predicates.as_ref() {
write!(f, " predicates=[")?;
let mut p_iter = predicates.iter();
while let Some((col_id, predicate)) = p_iter.next() {
let col_name = self.table_def.column_id_to_name(col_id).unwrap_or_default();
write!(f, "[{col_name}@{col_id} {predicate}]")?;
if p_iter.size_hint().0 > 0 {
write!(f, ", ")?;
}
}
write!(f, "]")?;
}
write!(f, " inner=")?;
self.inner.fmt_as(t, f)
}
}
}
}
impl ExecutionPlan for LastCacheExec {
fn name(&self) -> &str {
"LastCacheExec"
}
fn as_any(&self) -> &dyn Any {
self
}
fn properties(&self) -> &datafusion::physical_plan::PlanProperties {
self.inner.properties()
}
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
self.inner.children()
}
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> datafusion::error::Result<Arc<dyn ExecutionPlan>> {
// (copied from MemoryExec):
// MemoryExec has no children
if children.is_empty() {
Ok(self)
} else {
internal_err!("Children cannot be replaced in {self:?}")
}
}
fn execute(
&self,
partition: usize,
context: Arc<datafusion::execution::TaskContext>,
) -> datafusion::error::Result<datafusion::execution::SendableRecordBatchStream> {
self.inner.execute(partition, context)
}
}

View File

@ -19,7 +19,7 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::Expr;
use datafusion_util::config::DEFAULT_SCHEMA;
use datafusion_util::MemoryStream;
use influxdb3_cache::last_cache::LastCacheFunction;
use influxdb3_cache::last_cache::{LastCacheFunction, LAST_CACHE_UDTF_NAME};
use influxdb3_cache::meta_cache::{MetaCacheFunction, META_CACHE_UDTF_NAME};
use influxdb3_catalog::catalog::{Catalog, DatabaseSchema};
use influxdb3_sys_events::SysEventStore;
@ -491,8 +491,6 @@ impl QueryNamespace for Database {
}
}
const LAST_CACHE_UDTF_NAME: &str = "last_cache";
impl CatalogProvider for Database {
fn as_any(&self) -> &dyn Any {
self as &dyn Any

View File

@ -974,7 +974,7 @@ mod tests {
];
let actual = wbuf
.last_cache_provider()
.get_cache_record_batches(db_id, tbl_id, None, &[])
.get_cache_record_batches(db_id, tbl_id, None)
.unwrap()
.unwrap();
assert_batches_eq!(&expected, &actual);