Merge branch 'main' into savage/await-enqueue-rotation-returned-receiver-during-shutdown
commit 0ed4e8509b
@@ -855,9 +855,9 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "4.3.17"
+version = "4.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0827b011f6f8ab38590295339817b0d26f344aa4932c3ced71b45b0c54b4a9"
+checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d"
 dependencies = [
  "clap_builder",
  "clap_derive",

@@ -887,9 +887,9 @@ dependencies = [

 [[package]]
 name = "clap_builder"
-version = "4.3.17"
+version = "4.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9441b403be87be858db6a23edb493e7f694761acdc3343d5a0fcaafd304cbc9e"
+checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1"
 dependencies = [
  "anstream",
  "anstyle",

@@ -5683,9 +5683,9 @@ checksum = "d3543ca0810e71767052bdcdd5653f23998b192642a22c5164bfa6581e40a4a2"

 [[package]]
 name = "sysinfo"
-version = "0.29.5"
+version = "0.29.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b949f01f9c23823744b71e0060472ecbde578ef68cc2a9e46d114efd77c3034"
+checksum = "c7cb97a5a85a136d84e75d5c3cf89655090602efb1be0d8d5337b7e386af2908"
 dependencies = [
  "cfg-if",
  "core-foundation-sys",

@@ -6090,9 +6090,9 @@ dependencies = [

 [[package]]
 name = "tower-http"
-version = "0.4.2"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ac8060a61f8758a61562f6fb53ba3cbe1ca906f001df2e53cccddcdbee91e7c"
+checksum = "55ae70283aba8d2a8b411c695c437fe25b8b5e44e23e780662002fc72fb47a82"
 dependencies = [
  "bitflags 2.3.3",
  "bytes",
@@ -105,8 +105,6 @@ where
     L: Loader<K = Vec<K>, Extra = Vec<Extra>, V = Vec<V>>,
 {
     async fn flush(&self) {
-        trace!("flushing batch loader");
-
         let pending: Vec<_> = {
             let mut pending = self.inner.pending.lock();
             std::mem::take(pending.as_mut())
@@ -115,6 +113,8 @@ where
         if pending.is_empty() {
             return;
         }
+        trace!(n_pending = pending.len(), "flush batch loader",);
+
         let job_id = self.inner.job_id_counter.fetch_add(1, Ordering::SeqCst);
         let handle_recv = CancellationSafeFutureReceiver::default();

@@ -221,6 +221,15 @@ where

             if !pending.is_empty() {
                 self.flush().await;
+
+                // prevent hot-looping:
+                // It seems that in some cases the underlying loader is ready but the data is not yet available via the
+                // cache driver. This is likely due to the signalling system within the cache driver that prevents
+                // cancellation, but also allows side-loading and at the same time prevents the same key from being
+                // loaded multiple times. Tokio doesn't know that this method is basically a wait loop, so we yield back
+                // to the tokio worker to allow it to make some progress. Since flush+load take some time anyway,
+                // this yield is not performance-critical overall.
+                tokio::task::yield_now().await;
             }

             futures = pending;
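For context, the yield added above follows the usual pattern for cooperative wait loops on Tokio. A minimal, standalone sketch of that pattern (hypothetical helper, not part of this diff):

// Cooperative wait loop: without the yield, a ready-polling loop can
// monopolize its Tokio worker and starve the task that would eventually
// make `ready()` return true.
async fn wait_until<F>(mut ready: F)
where
    F: FnMut() -> bool,
{
    while !ready() {
        tokio::task::yield_now().await;
    }
}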
@@ -103,7 +103,9 @@ impl LevelBasedRoundInfo {
         // branch in the worst case, thus if that would result in too many files to compact in a single
         // plan, run a pre-phase to reduce the number of files first
         let num_overlapped_files = get_num_overlapped_files(start_level_files, next_level_files);
-        if num_start_level + num_overlapped_files > self.max_num_files_per_plan {
+        if num_start_level > 1
+            && num_start_level + num_overlapped_files > self.max_num_files_per_plan
+        {
             // This scenario meets the simple criterion that the start level files plus their overlaps are a lot of files.
             // But ManySmallFiles implies we must compact only within the start level to reduce the quantity of
             // start level files. There are several reasons why that might be unhelpful.
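The guard added above only matters when the start level actually has more than one file: with a single L0 file, a start-level-only (ManySmallFiles) pre-phase cannot reduce the file count. A small illustrative sketch of that condition (hypothetical helper, not the compactor's actual API):

// Hypothetical helper mirroring the condition above; names are illustrative only.
fn wants_pre_phase(num_start_level: usize, num_overlapped_files: usize, max_per_plan: usize) -> bool {
    // A start-level-only compaction of a single file cannot make progress,
    // so only trigger the pre-phase when there is more than one start-level file.
    num_start_level > 1 && num_start_level + num_overlapped_files > max_per_plan
}

fn main() {
    // 1 L0 file overlapping 28 L1 files with a 20-file limit: too many for one
    // plan, but an L0->L0 pre-phase would be pointless.
    assert!(!wants_pre_phase(1, 28, 20));
    // 5 L0 files with the same overlaps: the pre-phase can reduce the count.
    assert!(wants_pre_phase(5, 28, 20));
}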
@ -1730,3 +1730,414 @@ async fn stuck_l0_large_l0s() {
|
|||
"###
|
||||
);
|
||||
}
|
||||
|
||||
// This case is taken from a catalog where the partition was stuck doing single-file L0->L0 compactions with a ManySmallFiles classification.
// The key point is that there is 1 L0 file, and enough overlapping L1 files that the sum of the L0 and overlapping L1s is too many for
// a single compaction. So it tried to do L0->L0 compaction, but you can't get fewer than 1 L0 file...
|
||||
#[tokio::test]
|
||||
async fn single_file_compaction() {
|
||||
test_helpers::maybe_start_logging();
|
||||
|
||||
let max_files = 20;
|
||||
let setup = layout_setup_builder()
|
||||
.await
|
||||
.with_max_num_files_per_plan(max_files)
|
||||
.with_max_desired_file_size_bytes(MAX_DESIRED_FILE_SIZE)
|
||||
.with_partition_timeout(Duration::from_millis(1000))
|
||||
.with_suppress_run_output() // remove this to debug
|
||||
.build()
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681776057065884000)
|
||||
.with_max_time(1681848094846357000)
|
||||
.with_compaction_level(CompactionLevel::Final)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681848108803007952))
|
||||
.with_file_size_bytes(148352),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681848059723530000)
|
||||
.with_max_time(1681849022292840000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681849158083717413))
|
||||
.with_file_size_bytes(8532),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681849256770938000)
|
||||
.with_max_time(1681849612137939000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681849758018522867))
|
||||
.with_file_size_bytes(7180),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681849857540998000)
|
||||
.with_max_time(1681849933405747000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681850058063700468))
|
||||
.with_file_size_bytes(6354),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681850155949687000)
|
||||
.with_max_time(1681850525337964000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681850658095040165))
|
||||
.with_file_size_bytes(7224),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681850533564810000)
|
||||
.with_max_time(1681850800324334000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681850958072081740))
|
||||
.with_file_size_bytes(6442),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681850807902300000)
|
||||
.with_max_time(1681851109057342000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681851258099471556))
|
||||
.with_file_size_bytes(6467),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681851356697599000)
|
||||
.with_max_time(1681851731606438000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681851858069516381))
|
||||
.with_file_size_bytes(7202),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681851768198276000)
|
||||
.with_max_time(1681852656555310000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681852758025054620))
|
||||
.with_file_size_bytes(7901),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681852858788440000)
|
||||
.with_max_time(1681853202074816000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681853358030917913))
|
||||
.with_file_size_bytes(7175),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681853216031150000)
|
||||
.with_max_time(1681853533814380000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681853658084495307))
|
||||
.with_file_size_bytes(6461),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681853755089369000)
|
||||
.with_max_time(1681854114135030000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681854258102937522))
|
||||
.with_file_size_bytes(7172),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681854158528835000)
|
||||
.with_max_time(1681854411758250000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681854558107269518))
|
||||
.with_file_size_bytes(6445),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681854656198860000)
|
||||
.with_max_time(1681855901530453000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681856058068217803))
|
||||
.with_file_size_bytes(9388),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681855930016632000)
|
||||
.with_max_time(1681856215951881000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681856358077776391))
|
||||
.with_file_size_bytes(6411),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681856457094364000)
|
||||
.with_max_time(1681856572199715000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681856658099983774))
|
||||
.with_file_size_bytes(6471),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681856755669647000)
|
||||
.with_max_time(1681856797376786000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681856959540758502))
|
||||
.with_file_size_bytes(6347),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681857059467239000)
|
||||
.with_max_time(1681857411709822000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681857559463607724))
|
||||
.with_file_size_bytes(7179),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681857658708732000)
|
||||
.with_max_time(1681858001258834000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681858159653340111))
|
||||
.with_file_size_bytes(7171),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681858259089021000)
|
||||
.with_max_time(1681858311972651000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681858459694290981))
|
||||
.with_file_size_bytes(6417),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681858336136281000)
|
||||
.with_max_time(1681858611711634000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681858759770566450))
|
||||
.with_file_size_bytes(6432),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681858613076367000)
|
||||
.with_max_time(1681859207290151000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681859359651203045))
|
||||
.with_file_size_bytes(7211),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681859212497834000)
|
||||
.with_max_time(1681859549996540000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681859659796715205))
|
||||
.with_file_size_bytes(6408),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681859755984961000)
|
||||
.with_max_time(1681860397139689000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681860559596560745))
|
||||
.with_file_size_bytes(7919),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681860656403220000)
|
||||
.with_max_time(1681861312602593000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681861463769557785))
|
||||
.with_file_size_bytes(7920),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681861557592893000)
|
||||
.with_max_time(1681861592762435000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681861760075293126))
|
||||
.with_file_size_bytes(6432),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681861612304587000)
|
||||
.with_max_time(1681861928505695000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681862059957822724))
|
||||
.with_file_size_bytes(6456),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681862008720364000)
|
||||
.with_max_time(1681862268794595000)
|
||||
.with_compaction_level(CompactionLevel::FileNonOverlapped)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1681862511938856063))
|
||||
.with_file_size_bytes(6453),
|
||||
)
|
||||
.await;
|
||||
|
||||
setup
|
||||
.partition
|
||||
.create_parquet_file(
|
||||
parquet_builder()
|
||||
.with_min_time(1681776002714783000)
|
||||
.with_max_time(1681862102913137000)
|
||||
.with_compaction_level(CompactionLevel::Initial)
|
||||
.with_max_l0_created_at(Time::from_timestamp_nanos(1683039505904263771))
|
||||
.with_file_size_bytes(7225),
|
||||
)
|
||||
.await;
|
||||
|
||||
insta::assert_yaml_snapshot!(
|
||||
run_layout_scenario(&setup).await,
|
||||
@r###"
|
||||
---
|
||||
- "**** Input Files "
|
||||
- "L0 "
|
||||
- "L0.29[1681776002714783000,1681862102913137000] 1683039505.9s 7kb|-----------------------------------------L0.29-----------------------------------------| "
|
||||
- "L1 "
|
||||
- "L1.2[1681848059723530000,1681849022292840000] 1681849158.08s 8kb |L1.2| "
|
||||
- "L1.3[1681849256770938000,1681849612137939000] 1681849758.02s 7kb |L1.3| "
|
||||
- "L1.4[1681849857540998000,1681849933405747000] 1681850058.06s 6kb |L1.4| "
|
||||
- "L1.5[1681850155949687000,1681850525337964000] 1681850658.1s 7kb |L1.5| "
|
||||
- "L1.6[1681850533564810000,1681850800324334000] 1681850958.07s 6kb |L1.6| "
|
||||
- "L1.7[1681850807902300000,1681851109057342000] 1681851258.1s 6kb |L1.7| "
|
||||
- "L1.8[1681851356697599000,1681851731606438000] 1681851858.07s 7kb |L1.8| "
|
||||
- "L1.9[1681851768198276000,1681852656555310000] 1681852758.03s 8kb |L1.9| "
|
||||
- "L1.10[1681852858788440000,1681853202074816000] 1681853358.03s 7kb |L1.10| "
|
||||
- "L1.11[1681853216031150000,1681853533814380000] 1681853658.08s 6kb |L1.11| "
|
||||
- "L1.12[1681853755089369000,1681854114135030000] 1681854258.1s 7kb |L1.12| "
|
||||
- "L1.13[1681854158528835000,1681854411758250000] 1681854558.11s 6kb |L1.13| "
|
||||
- "L1.14[1681854656198860000,1681855901530453000] 1681856058.07s 9kb |L1.14| "
|
||||
- "L1.15[1681855930016632000,1681856215951881000] 1681856358.08s 6kb |L1.15|"
|
||||
- "L1.16[1681856457094364000,1681856572199715000] 1681856658.1s 6kb |L1.16|"
|
||||
- "L1.17[1681856755669647000,1681856797376786000] 1681856959.54s 6kb |L1.17|"
|
||||
- "L1.18[1681857059467239000,1681857411709822000] 1681857559.46s 7kb |L1.18|"
|
||||
- "L1.19[1681857658708732000,1681858001258834000] 1681858159.65s 7kb |L1.19|"
|
||||
- "L1.20[1681858259089021000,1681858311972651000] 1681858459.69s 6kb |L1.20|"
|
||||
- "L1.21[1681858336136281000,1681858611711634000] 1681858759.77s 6kb |L1.21|"
|
||||
- "L1.22[1681858613076367000,1681859207290151000] 1681859359.65s 7kb |L1.22|"
|
||||
- "L1.23[1681859212497834000,1681859549996540000] 1681859659.8s 6kb |L1.23|"
|
||||
- "L1.24[1681859755984961000,1681860397139689000] 1681860559.6s 8kb |L1.24|"
|
||||
- "L1.25[1681860656403220000,1681861312602593000] 1681861463.77s 8kb |L1.25|"
|
||||
- "L1.26[1681861557592893000,1681861592762435000] 1681861760.08s 6kb |L1.26|"
|
||||
- "L1.27[1681861612304587000,1681861928505695000] 1681862059.96s 6kb |L1.27|"
|
||||
- "L1.28[1681862008720364000,1681862268794595000] 1681862511.94s 6kb |L1.28|"
|
||||
- "L2 "
|
||||
- "L2.1[1681776057065884000,1681848094846357000] 1681848108.8s 145kb|----------------------------------L2.1-----------------------------------| "
|
||||
- "**** Final Output Files (192kb written)"
|
||||
- "L1 "
|
||||
- "L1.30[1681776002714783000,1681862268794595000] 1683039505.9s 192kb|-----------------------------------------L1.30------------------------------------------|"
|
||||
- "L2 "
|
||||
- "L2.1[1681776057065884000,1681848094846357000] 1681848108.8s 145kb|----------------------------------L2.1-----------------------------------| "
|
||||
"###
|
||||
);
|
||||
}
|
||||
|
|
|
@@ -339,6 +339,12 @@ SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
 -- supports offset parameter
 SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s, 1s) FILL(none);

+-- N.B. The gap filling of the COUNT(usage_idle) and COUNT(bytes_free)
+-- columns happens before the two measurements are UNIONed together
+-- when producing the output table. This means that a COUNT column for
+-- a field that is not present for a measurement will contain NULLs,
+-- rather than being filled with 0s. This is consistent with older
+-- versions of influxdb.
 SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk;
 SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY TIME(1s) FILL(none);
 SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY cpu;
@@ -360,7 +366,9 @@ SELECT COUNT(usage_idle), usage_idle FROM cpu;

 -- Default FILL(null) when FILL is omitted
 SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
+SELECT COUNT(usage_idle)+2 FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
 SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
+SELECT COUNT(usage_idle)+1, COUNT(bytes_free)+2 FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
 SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(null);
 SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(null);
 SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(previous);
@@ -919,10 +919,10 @@ name: logical_plan
 plan
 Sort: iox::measurement ASC NULLS LAST, tag0 ASC NULLS LAST, time ASC NULLS LAST
 Union
-Projection: Dictionary(Int32, Utf8("m0")) AS iox::measurement, TimestampNanosecond(0, None) AS time, m0.tag0 AS tag0, COUNT(m0.f64) AS count, SUM(m0.f64) AS sum, STDDEV(m0.f64) AS stddev
+Projection: Dictionary(Int32, Utf8("m0")) AS iox::measurement, TimestampNanosecond(0, None) AS time, m0.tag0 AS tag0, coalesce_struct(COUNT(m0.f64), Int64(0)) AS count, SUM(m0.f64) AS sum, STDDEV(m0.f64) AS stddev
 Aggregate: groupBy=[[m0.tag0]], aggr=[[COUNT(m0.f64), SUM(m0.f64), STDDEV(m0.f64)]]
 TableScan: m0 projection=[f64, tag0]
-Projection: Dictionary(Int32, Utf8("m1")) AS iox::measurement, TimestampNanosecond(0, None) AS time, m1.tag0 AS tag0, COUNT(m1.f64) AS count, SUM(m1.f64) AS sum, STDDEV(m1.f64) AS stddev
+Projection: Dictionary(Int32, Utf8("m1")) AS iox::measurement, TimestampNanosecond(0, None) AS time, m1.tag0 AS tag0, coalesce_struct(COUNT(m1.f64), Int64(0)) AS count, SUM(m1.f64) AS sum, STDDEV(m1.f64) AS stddev
 Aggregate: groupBy=[[m1.tag0]], aggr=[[COUNT(m1.f64), SUM(m1.f64), STDDEV(m1.f64)]]
 TableScan: m1 projection=[f64, tag0]
 name: physical_plan
@@ -930,7 +930,7 @@ name: physical_plan
 SortPreservingMergeExec: [iox::measurement@0 ASC NULLS LAST,tag0@2 ASC NULLS LAST,time@1 ASC NULLS LAST]
 UnionExec
 SortExec: expr=[iox::measurement@0 ASC NULLS LAST,tag0@2 ASC NULLS LAST,time@1 ASC NULLS LAST]
-ProjectionExec: expr=[m0 as iox::measurement, 0 as time, tag0@0 as tag0, COUNT(m0.f64)@1 as count, SUM(m0.f64)@2 as sum, STDDEV(m0.f64)@3 as stddev]
+ProjectionExec: expr=[m0 as iox::measurement, 0 as time, tag0@0 as tag0, coalesce_struct(COUNT(m0.f64)@1, 0) as count, SUM(m0.f64)@2 as sum, STDDEV(m0.f64)@3 as stddev]
 AggregateExec: mode=FinalPartitioned, gby=[tag0@0 as tag0], aggr=[COUNT(m0.f64), SUM(m0.f64), STDDEV(m0.f64)]
 CoalesceBatchesExec: target_batch_size=8192
 RepartitionExec: partitioning=Hash([tag0@0], 4), input_partitions=4
@@ -938,7 +938,7 @@ name: physical_plan
 AggregateExec: mode=Partial, gby=[tag0@1 as tag0], aggr=[COUNT(m0.f64), SUM(m0.f64), STDDEV(m0.f64)]
 ParquetExec: file_groups={1 group: [[1/1/1/00000000-0000-0000-0000-000000000000.parquet]]}, projection=[f64, tag0]
 SortExec: expr=[iox::measurement@0 ASC NULLS LAST,tag0@2 ASC NULLS LAST,time@1 ASC NULLS LAST]
-ProjectionExec: expr=[m1 as iox::measurement, 0 as time, tag0@0 as tag0, COUNT(m1.f64)@1 as count, SUM(m1.f64)@2 as sum, STDDEV(m1.f64)@3 as stddev]
+ProjectionExec: expr=[m1 as iox::measurement, 0 as time, tag0@0 as tag0, coalesce_struct(COUNT(m1.f64)@1, 0) as count, SUM(m1.f64)@2 as sum, STDDEV(m1.f64)@3 as stddev]
 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=4
 AggregateExec: mode=FinalPartitioned, gby=[tag0@0 as tag0], aggr=[COUNT(m1.f64), SUM(m1.f64), STDDEV(m1.f64)], ordering_mode=FullyOrdered
 CoalesceBatchesExec: target_batch_size=8192
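The planner change above wraps the COUNT aggregates in coalesce_struct(..., 0), so gap-filled windows report a count of 0 instead of NULL while SUM and STDDEV stay NULL; the snapshot tables below change accordingly. A minimal sketch of that fill rule in plain Rust (assumed semantics for illustration, not the IOx planner code):

// Hypothetical illustration of the fill rule introduced above: COUNT over an
// empty window is defined as 0, while other aggregates stay NULL (None here).
fn fill_count(window_count: Option<i64>) -> i64 {
    // coalesce_struct(COUNT(...), 0) in the plan has the same per-window effect.
    window_count.unwrap_or(0)
}

fn main() {
    assert_eq!(fill_count(Some(6)), 6); // window with data
    assert_eq!(fill_count(None), 0);    // gap-filled window
}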
@ -1267,9 +1267,19 @@ name: cpu
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 6 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle)+2 FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
|
||||
name: cpu
|
||||
+---------------------+-------+
|
||||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 8 |
|
||||
| 2022-10-31T02:00:30 | 2 |
|
||||
| 2022-10-31T02:01:00 | 2 |
|
||||
| 2022-10-31T02:01:30 | 2 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
|
||||
name: cpu
|
||||
|
@ -1277,18 +1287,37 @@ name: cpu
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 6 | |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | 0 | |
|
||||
| 2022-10-31T02:01:00 | 0 | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 6 |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | | 0 |
|
||||
| 2022-10-31T02:01:00 | | 0 |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle)+1, COUNT(bytes_free)+2 FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s);
|
||||
name: cpu
|
||||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 7 | |
|
||||
| 2022-10-31T02:00:30 | 1 | |
|
||||
| 2022-10-31T02:01:00 | 1 | |
|
||||
| 2022-10-31T02:01:30 | 1 | |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 8 |
|
||||
| 2022-10-31T02:00:30 | | 2 |
|
||||
| 2022-10-31T02:01:00 | | 2 |
|
||||
| 2022-10-31T02:01:30 | | 2 |
|
||||
+---------------------+-------+---------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(null);
|
||||
name: cpu
|
||||
|
@ -1296,9 +1325,9 @@ name: cpu
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 6 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(null);
|
||||
name: cpu
|
||||
|
@ -1306,18 +1335,18 @@ name: cpu
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 6 | |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | 0 | |
|
||||
| 2022-10-31T02:01:00 | 0 | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 6 |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | | 0 |
|
||||
| 2022-10-31T02:01:00 | | 0 |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) FILL(previous);
|
||||
name: cpu
|
||||
|
@ -1507,9 +1536,9 @@ tags: cpu=cpu-total
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu0
|
||||
|
@ -1517,9 +1546,9 @@ tags: cpu=cpu0
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu1
|
||||
|
@ -1527,9 +1556,9 @@ tags: cpu=cpu1
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s), cpu FILL(null);
|
||||
name: cpu
|
||||
|
@ -1538,9 +1567,9 @@ tags: cpu=cpu-total
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu0
|
||||
|
@ -1548,9 +1577,9 @@ tags: cpu=cpu0
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu1
|
||||
|
@ -1558,9 +1587,9 @@ tags: cpu=cpu1
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s), cpu, device FILL(null);
|
||||
name: cpu
|
||||
|
@ -1569,9 +1598,9 @@ tags: cpu=cpu-total, device=
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 2 | |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | 0 | |
|
||||
| 2022-10-31T02:01:00 | 0 | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: cpu
|
||||
tags: cpu=cpu0, device=
|
||||
|
@ -1579,9 +1608,9 @@ tags: cpu=cpu0, device=
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 2 | |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | 0 | |
|
||||
| 2022-10-31T02:01:00 | 0 | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: cpu
|
||||
tags: cpu=cpu1, device=
|
||||
|
@ -1589,9 +1618,9 @@ tags: cpu=cpu1, device=
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | 2 | |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | 0 | |
|
||||
| 2022-10-31T02:01:00 | 0 | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
tags: cpu=, device=disk1s1
|
||||
|
@ -1599,9 +1628,9 @@ tags: cpu=, device=disk1s1
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 2 |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | | 0 |
|
||||
| 2022-10-31T02:01:00 | | 0 |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
tags: cpu=, device=disk1s2
|
||||
|
@ -1609,9 +1638,9 @@ tags: cpu=, device=disk1s2
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 2 |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | | 0 |
|
||||
| 2022-10-31T02:01:00 | | 0 |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
tags: cpu=, device=disk1s5
|
||||
|
@ -1619,9 +1648,9 @@ tags: cpu=, device=disk1s5
|
|||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:00:00 | | 2 |
|
||||
| 2022-10-31T02:00:30 | | |
|
||||
| 2022-10-31T02:01:00 | | |
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:00:30 | | 0 |
|
||||
| 2022-10-31T02:01:00 | | 0 |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s), cpu FILL(previous);
|
||||
name: cpu
|
||||
|
@ -2202,15 +2231,15 @@ name: cpu
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 6 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:05:00Z' GROUP BY TIME(30s) LIMIT 2 OFFSET 2;
|
||||
name: cpu
|
||||
+---------------------+-------+
|
||||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:01:00 | |
|
||||
| 2022-10-31T02:01:30 | |
|
||||
| 2022-10-31T02:01:00 | 0 |
|
||||
| 2022-10-31T02:01:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle) FROM cpu WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:05:00Z' GROUP BY TIME(30s), cpu LIMIT 2;
|
||||
name: cpu
|
||||
|
@ -2219,7 +2248,7 @@ tags: cpu=cpu-total
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu0
|
||||
|
@ -2227,7 +2256,7 @@ tags: cpu=cpu0
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
+---------------------+-------+
|
||||
name: cpu
|
||||
tags: cpu=cpu1
|
||||
|
@ -2235,7 +2264,7 @@ tags: cpu=cpu1
|
|||
| time | count |
|
||||
+---------------------+-------+
|
||||
| 2022-10-31T02:00:00 | 2 |
|
||||
| 2022-10-31T02:00:30 | |
|
||||
| 2022-10-31T02:00:30 | 0 |
|
||||
+---------------------+-------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s) LIMIT 1;
|
||||
name: cpu
|
||||
|
@ -2268,13 +2297,13 @@ name: cpu
|
|||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:01:30 | 0 | |
|
||||
+---------------------+-------+---------+
|
||||
name: disk
|
||||
+---------------------+-------+---------+
|
||||
| time | count | count_1 |
|
||||
+---------------------+-------+---------+
|
||||
| 2022-10-31T02:01:30 | | |
|
||||
| 2022-10-31T02:01:30 | | 0 |
|
||||
+---------------------+-------+---------+
|
||||
-- InfluxQL: SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(30s), cpu, device LIMIT 1;
|
||||
name: cpu
|
||||
|
|
|
@@ -21,6 +21,19 @@ SELECT difference(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AN
-- group by time and a tag
SELECT difference(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;

--
-- difference + selector
--
SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
-- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of difference
SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
-- linear filling of selector functions produces an execution error
-- (see https://github.com/influxdata/influxdb_iox/issues/8302).
-- SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
-- group by time and a tag
SELECT difference(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;

--
-- non_negative_difference
@ -35,6 +48,11 @@ SELECT non_negative_difference(usage_idle) FROM cpu WHERE time >= 00000001300000
|
|||
--
|
||||
SELECT non_negative_difference(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- non_negative_difference + selector
|
||||
--
|
||||
SELECT non_negative_difference(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- moving_average
|
||||
--
|
||||
|
@ -61,6 +79,17 @@ SELECT moving_average(mean(writes), 3) FROM diskio WHERE time >= 000000013000000
|
|||
SELECT moving_average(mean(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
SELECT moving_average(mean(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
|
||||
--
|
||||
-- moving_average + selector
|
||||
--
|
||||
-- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of moving_average
|
||||
SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
-- linear filling of selector functions produces an execution error
|
||||
-- (see https://github.com/influxdata/influxdb_iox/issues/8302).
|
||||
-- SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
|
||||
--
|
||||
-- combining window functions
|
||||
--
|
||||
|
@@ -109,7 +138,7 @@ SELECT derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AN
 SELECT derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
 SELECT derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
 SELECT derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
--- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of difference
+-- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of derivative
 SELECT derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
 SELECT derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
 SELECT derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
@ -120,6 +149,26 @@ SELECT derivative(mean(writes), 500ms) FROM diskio WHERE time >= 000000013000000
|
|||
SELECT derivative(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
SELECT derivative(mean(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- derivative + selector
|
||||
--
|
||||
SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
-- the input data is regular data at 10s intervals, so 7s windows ensure the `first` generates windows with NULL values to test NULL handling of derivative
|
||||
SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
-- linear filling of selector functions produces an execution error
|
||||
-- (see https://github.com/influxdata/influxdb_iox/issues/8302).
|
||||
-- SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- group by time and a tag
|
||||
SELECT derivative(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
SELECT derivative(first(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- non_negative_derivative
|
||||
--
|
||||
|
@@ -138,7 +187,7 @@ SELECT non_negative_derivative(mean(writes)) FROM diskio WHERE time >= 000000013
 SELECT non_negative_derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
 SELECT non_negative_derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
 SELECT non_negative_derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
--- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of difference
+-- the input data is regular data at 10s intervals, so 7s windows ensure the `mean` generates windows with NULL values to test NULL handling of non_negative_derivative
 SELECT non_negative_derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
 SELECT non_negative_derivative(mean(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
 SELECT non_negative_derivative(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
@ -149,6 +198,26 @@ SELECT non_negative_derivative(mean(writes), 500ms) FROM diskio WHERE time >= 00
|
|||
SELECT non_negative_derivative(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
SELECT non_negative_derivative(mean(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- non_negative_derivative + selector
|
||||
--
|
||||
SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
-- the input data is regular data at 10s intervals, so 7s windows ensure the `first` generates windows with NULL values to test NULL handling of non_negative_derivative
|
||||
SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
-- linear filling of selector functions produces an execution error
|
||||
-- (see https://github.com/influxdata/influxdb_iox/issues/8302).
|
||||
-- SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- group by time and a tag
|
||||
SELECT non_negative_derivative(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
SELECT non_negative_derivative(first(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- cumulative_sum
|
||||
--
|
||||
|
@ -167,4 +236,18 @@ SELECT cumulative_sum(mean(writes)) FROM diskio WHERE time >= 000000013000000000
|
|||
SELECT cumulative_sum(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
SELECT cumulative_sum(mean(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- group by time and a tag
|
||||
SELECT cumulative_sum(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
SELECT cumulative_sum(mean(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
|
||||
--
|
||||
-- cumulative_sum + selector
|
||||
--
|
||||
SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
-- the input data is regular data at 10s intervals, so 7s windows ensure the `first` generates windows with NULL values to test NULL handling of cumulative_sum
|
||||
SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
-- linear filling of selector functions produces an execution error
|
||||
-- (see https://github.com/influxdata/influxdb_iox/issues/8302).
|
||||
-- SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(linear);
|
||||
-- group by time and a tag
|
||||
SELECT cumulative_sum(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
|
@ -148,6 +148,86 @@ tags: cpu=cpu1
|
|||
| 1970-01-01T00:02:30 | -0.03333333333334565 |
|
||||
| 1970-01-01T00:03:00 | -0.03333333333333144 |
|
||||
+---------------------+----------------------+
|
||||
-- InfluxQL: SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
|
||||
name: diskio
|
||||
+---------------------+------------+
|
||||
| time | difference |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T00:02:20 | 164 |
|
||||
| 1970-01-01T00:02:27 | 187 |
|
||||
| 1970-01-01T00:02:34 | 112 |
|
||||
| 1970-01-01T00:02:48 | 110 |
|
||||
| 1970-01-01T00:02:55 | 219 |
|
||||
| 1970-01-01T00:03:09 | 75 |
|
||||
| 1970-01-01T00:03:16 | 76 |
|
||||
| 1970-01-01T00:03:30 | 146 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
|
||||
name: diskio
|
||||
+---------------------+------------+
|
||||
| time | difference |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T00:02:00 | 366 |
|
||||
| 1970-01-01T00:02:30 | 421 |
|
||||
| 1970-01-01T00:03:00 | 441 |
|
||||
| 1970-01-01T00:03:30 | 297 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
|
||||
name: diskio
|
||||
+---------------------+------------+
|
||||
| time | difference |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T00:02:06 | 5592646 |
|
||||
| 1970-01-01T00:02:13 | -5592646 |
|
||||
| 1970-01-01T00:02:20 | 5592810 |
|
||||
| 1970-01-01T00:02:27 | 187 |
|
||||
| 1970-01-01T00:02:34 | 112 |
|
||||
| 1970-01-01T00:02:41 | -5593109 |
|
||||
| 1970-01-01T00:02:48 | 5593219 |
|
||||
| 1970-01-01T00:02:55 | 219 |
|
||||
| 1970-01-01T00:03:02 | -5593438 |
|
||||
| 1970-01-01T00:03:09 | 5593513 |
|
||||
| 1970-01-01T00:03:16 | 76 |
|
||||
| 1970-01-01T00:03:23 | -5593589 |
|
||||
| 1970-01-01T00:03:30 | 5593735 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT difference(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
|
||||
name: diskio
|
||||
+---------------------+------------+
|
||||
| time | difference |
|
||||
+---------------------+------------+
|
||||
| 1970-01-01T00:02:13 | 0 |
|
||||
| 1970-01-01T00:02:20 | 164 |
|
||||
| 1970-01-01T00:02:27 | 187 |
|
||||
| 1970-01-01T00:02:34 | 112 |
|
||||
| 1970-01-01T00:02:41 | 0 |
|
||||
| 1970-01-01T00:02:48 | 110 |
|
||||
| 1970-01-01T00:02:55 | 219 |
|
||||
| 1970-01-01T00:03:02 | 0 |
|
||||
| 1970-01-01T00:03:09 | 75 |
|
||||
| 1970-01-01T00:03:16 | 76 |
|
||||
| 1970-01-01T00:03:23 | 0 |
|
||||
| 1970-01-01T00:03:30 | 146 |
|
||||
+---------------------+------------+
|
||||
-- InfluxQL: SELECT difference(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
|
||||
name: cpu
|
||||
tags: cpu=cpu0
|
||||
+---------------------+---------------------+
|
||||
| time | difference |
|
||||
+---------------------+---------------------+
|
||||
| 1970-01-01T00:02:00 | -0.7999999999999972 |
|
||||
| 1970-01-01T00:02:30 | 3.5 |
|
||||
| 1970-01-01T00:03:00 | -0.4000000000000057 |
|
||||
+---------------------+---------------------+
|
||||
name: cpu
|
||||
tags: cpu=cpu1
|
||||
+---------------------+----------------------+
|
||||
| time | difference |
|
||||
+---------------------+----------------------+
|
||||
| 1970-01-01T00:02:00 | 0.20000000000000284 |
|
||||
| 1970-01-01T00:02:30 | 0.0 |
|
||||
| 1970-01-01T00:03:00 | -0.10000000000000853 |
|
||||
+---------------------+----------------------+
|
||||
-- InfluxQL: SELECT non_negative_difference(usage_system) FROM cpu WHERE time >= 0000000060000000000 AND time < 0000000210000000001 AND cpu = 'cpu0';
|
||||
name: cpu
|
||||
+---------------------+-------------------------+
|
||||
|
@ -202,6 +282,22 @@ tags: cpu=cpu1
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 0.36666666666667425 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_difference(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+-------------------------+
| time | non_negative_difference |
+---------------------+-------------------------+
| 1970-01-01T00:02:30 | 3.5 |
+---------------------+-------------------------+
name: cpu
tags: cpu=cpu1
+---------------------+-------------------------+
| time | non_negative_difference |
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 0.20000000000000284 |
| 1970-01-01T00:02:30 | 0.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT moving_average(writes, 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
name: diskio
+---------------------+-------------------+
@ -307,6 +403,54 @@ name: diskio
| 1970-01-01T00:03:23 | 5593588.0 |
| 1970-01-01T00:03:30 | 5593662.0 |
+---------------------+-------------------+
-- InfluxQL: SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+-------------------+
| time | moving_average |
+---------------------+-------------------+
| 1970-01-01T00:02:27 | 5592817.666666667 |
| 1970-01-01T00:02:34 | 5592972.0 |
| 1970-01-01T00:02:48 | 5593108.333333333 |
| 1970-01-01T00:02:55 | 5593255.333333333 |
| 1970-01-01T00:03:09 | 5593390.0 |
| 1970-01-01T00:03:16 | 5593513.333333333 |
| 1970-01-01T00:03:30 | 5593612.333333333 |
+---------------------+-------------------+
-- InfluxQL: SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+--------------------+
| time | moving_average |
+---------------------+--------------------+
| 1970-01-01T00:02:13 | 1864215.3333333333 |
| 1970-01-01T00:02:20 | 3728485.3333333335 |
| 1970-01-01T00:02:27 | 3728602.3333333335 |
| 1970-01-01T00:02:34 | 5592972.0 |
| 1970-01-01T00:02:41 | 3728702.0 |
| 1970-01-01T00:02:48 | 3728776.0 |
| 1970-01-01T00:02:55 | 3728885.6666666665 |
| 1970-01-01T00:03:02 | 3728885.6666666665 |
| 1970-01-01T00:03:09 | 3728983.6666666665 |
| 1970-01-01T00:03:16 | 3729034.0 |
| 1970-01-01T00:03:23 | 3729034.0 |
| 1970-01-01T00:03:30 | 3729108.0 |
+---------------------+--------------------+
-- InfluxQL: SELECT moving_average(first(writes), 3) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+-------------------+
| time | moving_average |
+---------------------+-------------------+
| 1970-01-01T00:02:20 | 5592700.666666667 |
| 1970-01-01T00:02:27 | 5592817.666666667 |
| 1970-01-01T00:02:34 | 5592972.0 |
| 1970-01-01T00:02:41 | 5593071.666666667 |
| 1970-01-01T00:02:48 | 5593145.666666667 |
| 1970-01-01T00:02:55 | 5593255.333333333 |
| 1970-01-01T00:03:02 | 5593365.0 |
| 1970-01-01T00:03:09 | 5593463.0 |
| 1970-01-01T00:03:16 | 5593513.333333333 |
| 1970-01-01T00:03:23 | 5593563.666666667 |
| 1970-01-01T00:03:30 | 5593637.666666667 |
+---------------------+-------------------+
-- InfluxQL: SELECT difference(usage_idle), non_negative_difference(usage_idle), moving_average(usage_idle, 4) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY cpu;
name: cpu
tags: cpu=cpu0
@ -649,6 +793,166 @@ tags: cpu=cpu1
| 1970-01-01T00:02:30 | -0.0005555555555557608 |
| 1970-01-01T00:03:00 | -0.000555555555555524 |
+---------------------+------------------------+
-- InfluxQL: SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+------------+
| time | derivative |
+---------------------+------------+
| 1970-01-01T00:02:20 | 82.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:48 | 55.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:09 | 37.5 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:30 | 73.0 |
+---------------------+------------+
-- InfluxQL: SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+--------------------+
| time | derivative |
+---------------------+--------------------+
| 1970-01-01T00:02:20 | 5.857142857142857 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:48 | 3.9285714285714284 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:09 | 2.6785714285714284 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:30 | 5.214285714285714 |
+---------------------+--------------------+
-- InfluxQL: SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
name: diskio
+---------------------+------------+
| time | derivative |
+---------------------+------------+
| 1970-01-01T00:02:00 | 366.0 |
| 1970-01-01T00:02:30 | 421.0 |
| 1970-01-01T00:03:00 | 441.0 |
| 1970-01-01T00:03:30 | 297.0 |
+---------------------+------------+
-- InfluxQL: SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
name: diskio
+---------------------+-------------------+
| time | derivative |
+---------------------+-------------------+
| 1970-01-01T00:02:00 | 6.1 |
| 1970-01-01T00:02:30 | 7.016666666666667 |
| 1970-01-01T00:03:00 | 7.35 |
| 1970-01-01T00:03:30 | 4.95 |
+---------------------+-------------------+
-- InfluxQL: SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+------------+
| time | derivative |
+---------------------+------------+
| 1970-01-01T00:02:06 | 5592646.0 |
| 1970-01-01T00:02:13 | -5592646.0 |
| 1970-01-01T00:02:20 | 5592810.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:41 | -5593109.0 |
| 1970-01-01T00:02:48 | 5593219.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:02 | -5593438.0 |
| 1970-01-01T00:03:09 | 5593513.0 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:23 | -5593589.0 |
| 1970-01-01T00:03:30 | 5593735.0 |
+---------------------+------------+
-- InfluxQL: SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+---------------------+
| time | derivative |
+---------------------+---------------------+
| 1970-01-01T00:02:06 | 399474.71428571426 |
| 1970-01-01T00:02:13 | -399474.71428571426 |
| 1970-01-01T00:02:20 | 399486.4285714286 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:41 | -399507.78571428574 |
| 1970-01-01T00:02:48 | 399515.64285714284 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:02 | -399531.28571428574 |
| 1970-01-01T00:03:09 | 399536.64285714284 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:23 | -399542.0714285714 |
| 1970-01-01T00:03:30 | 399552.5 |
+---------------------+---------------------+
-- InfluxQL: SELECT derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+------------+
| time | derivative |
+---------------------+------------+
| 1970-01-01T00:02:13 | 0.0 |
| 1970-01-01T00:02:20 | 164.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:41 | 0.0 |
| 1970-01-01T00:02:48 | 110.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:02 | 0.0 |
| 1970-01-01T00:03:09 | 75.0 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:23 | 0.0 |
| 1970-01-01T00:03:30 | 146.0 |
+---------------------+------------+
-- InfluxQL: SELECT derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+--------------------+
| time | derivative |
+---------------------+--------------------+
| 1970-01-01T00:02:13 | 0.0 |
| 1970-01-01T00:02:20 | 11.714285714285714 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:41 | 0.0 |
| 1970-01-01T00:02:48 | 7.857142857142857 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:02 | 0.0 |
| 1970-01-01T00:03:09 | 5.357142857142857 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:23 | 0.0 |
| 1970-01-01T00:03:30 | 10.428571428571429 |
+---------------------+--------------------+
-- InfluxQL: SELECT derivative(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+---------------------+
| time | derivative |
+---------------------+---------------------+
| 1970-01-01T00:02:00 | -0.7999999999999972 |
| 1970-01-01T00:02:30 | 3.5 |
| 1970-01-01T00:03:00 | -0.4000000000000057 |
+---------------------+---------------------+
name: cpu
tags: cpu=cpu1
+---------------------+----------------------+
| time | derivative |
+---------------------+----------------------+
| 1970-01-01T00:02:00 | 0.20000000000000284 |
| 1970-01-01T00:02:30 | 0.0 |
| 1970-01-01T00:03:00 | -0.10000000000000853 |
+---------------------+----------------------+
-- InfluxQL: SELECT derivative(first(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+-----------------------+
| time | derivative |
+---------------------+-----------------------+
| 1970-01-01T00:02:00 | -0.013333333333333286 |
| 1970-01-01T00:02:30 | 0.058333333333333334 |
| 1970-01-01T00:03:00 | -0.006666666666666762 |
+---------------------+-----------------------+
name: cpu
tags: cpu=cpu1
+---------------------+------------------------+
| time | derivative |
+---------------------+------------------------+
| 1970-01-01T00:02:00 | 0.003333333333333381 |
| 1970-01-01T00:02:30 | 0.0 |
| 1970-01-01T00:03:00 | -0.0016666666666668088 |
+---------------------+------------------------+
-- InfluxQL: SELECT non_negative_derivative(writes) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
name: diskio
+---------------------+-------------------------+
@ -918,6 +1222,152 @@ tags: cpu=cpu1
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 0.006111111111111237 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:20 | 82.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:48 | 55.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:09 | 37.5 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:30 | 73.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:20 | 5.857142857142857 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:48 | 3.9285714285714284 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:09 | 2.6785714285714284 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:30 | 5.214285714285714 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 366.0 |
| 1970-01-01T00:02:30 | 421.0 |
| 1970-01-01T00:03:00 | 441.0 |
| 1970-01-01T00:03:30 | 297.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 6.1 |
| 1970-01-01T00:02:30 | 7.016666666666667 |
| 1970-01-01T00:03:00 | 7.35 |
| 1970-01-01T00:03:30 | 4.95 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:06 | 5592646.0 |
| 1970-01-01T00:02:20 | 5592810.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:48 | 5593219.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:09 | 5593513.0 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:30 | 5593735.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:06 | 399474.71428571426 |
| 1970-01-01T00:02:20 | 399486.4285714286 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:48 | 399515.64285714284 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:09 | 399536.64285714284 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:30 | 399552.5 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:13 | 0.0 |
| 1970-01-01T00:02:20 | 164.0 |
| 1970-01-01T00:02:27 | 187.0 |
| 1970-01-01T00:02:34 | 112.0 |
| 1970-01-01T00:02:41 | 0.0 |
| 1970-01-01T00:02:48 | 110.0 |
| 1970-01-01T00:02:55 | 219.0 |
| 1970-01-01T00:03:02 | 0.0 |
| 1970-01-01T00:03:09 | 75.0 |
| 1970-01-01T00:03:16 | 76.0 |
| 1970-01-01T00:03:23 | 0.0 |
| 1970-01-01T00:03:30 | 146.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(writes), 500ms) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:13 | 0.0 |
| 1970-01-01T00:02:20 | 11.714285714285714 |
| 1970-01-01T00:02:27 | 13.357142857142858 |
| 1970-01-01T00:02:34 | 8.0 |
| 1970-01-01T00:02:41 | 0.0 |
| 1970-01-01T00:02:48 | 7.857142857142857 |
| 1970-01-01T00:02:55 | 15.642857142857142 |
| 1970-01-01T00:03:02 | 0.0 |
| 1970-01-01T00:03:09 | 5.357142857142857 |
| 1970-01-01T00:03:16 | 5.428571428571429 |
| 1970-01-01T00:03:23 | 0.0 |
| 1970-01-01T00:03:30 | 10.428571428571429 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:30 | 3.5 |
+---------------------+-------------------------+
name: cpu
tags: cpu=cpu1
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 0.20000000000000284 |
| 1970-01-01T00:02:30 | 0.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT non_negative_derivative(first(usage_idle), 500ms) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:30 | 0.058333333333333334 |
+---------------------+-------------------------+
name: cpu
tags: cpu=cpu1
+---------------------+-------------------------+
| time | non_negative_derivative |
+---------------------+-------------------------+
| 1970-01-01T00:02:00 | 0.003333333333333381 |
| 1970-01-01T00:02:30 | 0.0 |
+---------------------+-------------------------+
-- InfluxQL: SELECT cumulative_sum(writes) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001;
name: diskio
+---------------------+----------------+
@ -1093,4 +1543,86 @@ tags: cpu=cpu1
| 1970-01-01T00:02:00 | 99.85 |
| 1970-01-01T00:02:30 | 199.68333333333334 |
| 1970-01-01T00:03:00 | 299.48333333333335 |
+---------------------+--------------------+
+---------------------+--------------------+
-- InfluxQL: SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s);
name: diskio
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:06 | 5592646 |
| 1970-01-01T00:02:20 | 11185456 |
| 1970-01-01T00:02:27 | 16778453 |
| 1970-01-01T00:02:34 | 22371562 |
| 1970-01-01T00:02:48 | 27964781 |
| 1970-01-01T00:02:55 | 33558219 |
| 1970-01-01T00:03:09 | 39151732 |
| 1970-01-01T00:03:16 | 44745321 |
| 1970-01-01T00:03:30 | 50339056 |
+---------------------+----------------+
-- InfluxQL: SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(30s);
name: diskio
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:00 | 5592646 |
| 1970-01-01T00:02:30 | 11185643 |
| 1970-01-01T00:03:00 | 16779081 |
| 1970-01-01T00:03:30 | 22372816 |
+---------------------+----------------+
-- InfluxQL: SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(0);
name: diskio
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:06 | 5592646 |
| 1970-01-01T00:02:13 | 5592646 |
| 1970-01-01T00:02:20 | 11185456 |
| 1970-01-01T00:02:27 | 16778453 |
| 1970-01-01T00:02:34 | 22371562 |
| 1970-01-01T00:02:41 | 22371562 |
| 1970-01-01T00:02:48 | 27964781 |
| 1970-01-01T00:02:55 | 33558219 |
| 1970-01-01T00:03:02 | 33558219 |
| 1970-01-01T00:03:09 | 39151732 |
| 1970-01-01T00:03:16 | 44745321 |
| 1970-01-01T00:03:23 | 44745321 |
| 1970-01-01T00:03:30 | 50339056 |
+---------------------+----------------+
-- InfluxQL: SELECT cumulative_sum(first(writes)) FROM diskio WHERE time >= 0000000130000000000 AND time < 0000000210000000001 GROUP BY time(7s) fill(previous);
name: diskio
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:06 | 5592646 |
| 1970-01-01T00:02:13 | 11185292 |
| 1970-01-01T00:02:20 | 16778102 |
| 1970-01-01T00:02:27 | 22371099 |
| 1970-01-01T00:02:34 | 27964208 |
| 1970-01-01T00:02:41 | 33557317 |
| 1970-01-01T00:02:48 | 39150536 |
| 1970-01-01T00:02:55 | 44743974 |
| 1970-01-01T00:03:02 | 50337412 |
| 1970-01-01T00:03:09 | 55930925 |
| 1970-01-01T00:03:16 | 61524514 |
| 1970-01-01T00:03:23 | 67118103 |
| 1970-01-01T00:03:30 | 72711838 |
+---------------------+----------------+
-- InfluxQL: SELECT cumulative_sum(first(usage_idle)) FROM cpu WHERE time >= 0000000130000000000 AND time < 0000000210000000001 AND cpu =~ /^cpu(0|1)$/ GROUP BY TIME(30s), cpu;
name: cpu
tags: cpu=cpu0
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:00 | 89.8 |
| 1970-01-01T00:02:30 | 180.2 |
| 1970-01-01T00:03:00 | 270.2 |
+---------------------+----------------+
name: cpu
tags: cpu=cpu1
+---------------------+----------------+
| time | cumulative_sum |
+---------------------+----------------+
| 1970-01-01T00:02:00 | 99.8 |
| 1970-01-01T00:02:30 | 199.7 |
| 1970-01-01T00:03:00 | 299.5 |
+---------------------+----------------+
@ -2,9 +2,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
use async_trait::async_trait;
use backoff::BackoffConfig;
use data_types::{
    NamespaceId, Partition, PartitionHashId, PartitionId, PartitionKey, SequenceNumber, TableId,
};
use data_types::{NamespaceId, Partition, PartitionHashId, PartitionId, PartitionKey, TableId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
@ -222,6 +220,7 @@ mod tests {
    // Harmless in tests - saves a bunch of extra vars.
    #![allow(clippy::await_holding_lock)]

    use data_types::PartitionId;
    use iox_catalog::mem::MemCatalog;

    use super::*;
@ -6,7 +6,6 @@ use std::{
    },
};

use arrow::compute::kernels::partition;
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, TableId};
use futures::{future::Shared, FutureExt};
@ -25,11 +24,10 @@ use super::PartitionProvider;
type BoxedResolveFuture =
    Pin<Box<dyn std::future::Future<Output = Arc<Mutex<PartitionData>>> + Send>>;

/// A compound key of `(namespace, table, partition_key)` which uniquely
/// A compound key of `(table, partition_key)` which uniquely
/// identifies a single partition.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Key {
    namespace_id: NamespaceId,
    table_id: TableId,
    partition_key: PartitionKey,
}
@ -149,7 +147,6 @@ where
        table: Arc<DeferredLoad<TableMetadata>>,
    ) -> Arc<Mutex<PartitionData>> {
        let key = Key {
            namespace_id,
            table_id,
            partition_key: partition_key.clone(), // Ref-counted anyway!
        };
@ -267,12 +264,11 @@ mod tests {
    use assert_matches::assert_matches;
    use futures::Future;
    use futures::{stream::FuturesUnordered, StreamExt};
    use lazy_static::lazy_static;
    use test_helpers::timeout::FutureTimeout;
    use tokio::sync::{Notify, Semaphore};

    use crate::{
        buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState},
        buffer_tree::partition::resolver::mock::MockPartitionProvider,
        test_util::{
            defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
            ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_ID,
@ -2,8 +2,6 @@
//!
//! [`PartitionData`]: crate::buffer_tree::partition::PartitionData

#![allow(unused_imports)] // Transition time only.

mod cache;
pub(crate) use cache::*;
@ -49,11 +49,11 @@ where
#[cfg(test)]
mod tests {
    use std::{sync::Arc, time::Duration};
    use std::sync::Arc;

    use super::*;
    use crate::{
        buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState},
        buffer_tree::partition::resolver::mock::MockPartitionProvider,
        test_util::{
            defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
            ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_PARTITION_KEY,
@ -998,12 +998,8 @@ mod tests {
        assert_eq!(m, 1, "tables counter mismatch");
    }

    /// Assert that multiple writes to a single namespace/table results in a
    /// single namespace being created, and matching metrics.
    #[tokio::test]
    async fn test_partition_iter() {
        // Configure the mock partition provider to return a single partition, named
        // p1.
        let partition_provider = Arc::new(
            MockPartitionProvider::default()
                .with_partition(
@ -1323,18 +1323,25 @@ impl<'a> InfluxQLToLogicalPlan<'a> {
            _ => None,
        };

        // Some aggregates, such as COUNT, should be filled with zero by default
        // rather than NULL.
        let should_zero_fill_expr = fields
            .iter()
            .map(is_zero_filled_aggregate_field)
            .collect::<Vec<_>>();

        // Rewrite the aggregate columns from the projection, so that the expressions
        // refer to the columns from the aggregate projection
        let select_exprs_post_aggr = select_exprs
            .iter()
            .zip(should_fill_expr)
            .map(|(expr, should_fill)| {
            .zip(should_fill_expr.iter().zip(should_zero_fill_expr))
            .map(|(expr, (should_fill, should_zero_fill))| {
                // This implements the `FILL(<value>)` strategy, by coalescing any aggregate
                // expressions to `<value>` when they are `NULL`.
                let fill_if_null = if fill_if_null.is_some() && should_fill {
                    fill_if_null
                } else {
                    None
                let fill_if_null = match (fill_if_null, should_fill, should_zero_fill) {
                    (Some(_), true, _) => fill_if_null,
                    (None, true, true) => Some(0.into()),
                    _ => None,
                };

                rebase_expr(expr, &aggr_projection_exprs, &fill_if_null, &plan)
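The hunk above changes how a NULL aggregate value is coalesced: an explicit FILL(<value>) still wins for any aggregate field, and COUNT-like aggregates now default to zero instead of NULL. A minimal standalone sketch of that selection logic only (a plain Option<i64> stands in for the planner's literal expression; fill_for_field and its parameters are illustrative names, not the crate's API):

/// Decide which value, if any, a NULL aggregate result is coalesced to.
///
/// * `fill_value`       - the literal from an explicit FILL(<value>), if one was given
/// * `should_fill`      - true if the field is an aggregate expression at all
/// * `should_zero_fill` - true if the aggregate defaults to zero (e.g. COUNT)
fn fill_for_field(fill_value: Option<i64>, should_fill: bool, should_zero_fill: bool) -> Option<i64> {
    match (fill_value, should_fill, should_zero_fill) {
        // An explicit FILL(<value>) wins for any aggregate field.
        (Some(v), true, _) => Some(v),
        // No explicit fill, but the aggregate zero-fills by default (COUNT).
        (None, true, true) => Some(0),
        // Everything else keeps NULL.
        _ => None,
    }
}

fn main() {
    // COUNT with no FILL clause is coalesced to 0 ...
    assert_eq!(fill_for_field(None, true, true), Some(0));
    // ... SUM with no FILL clause stays NULL ...
    assert_eq!(fill_for_field(None, true, false), None);
    // ... and FILL(7) overrides both.
    assert_eq!(fill_for_field(Some(7), true, false), Some(7));
}

This matches the plan snapshots further down, where COUNT projections gain a coalesce_struct(COUNT(...), Int64(0)) wrapper while other aggregates are left untouched.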
@ -3081,6 +3088,16 @@ fn is_aggregate_field(f: &Field) -> bool {
    .is_break()
}

/// A utility function that checks whether `f` is an aggregate field
/// that should be filled with a 0 rather than a NULL.
fn is_zero_filled_aggregate_field(f: &Field) -> bool {
    walk_expr(&f.expr, &mut |e| match e {
        IQLExpr::Call(Call { name, .. }) if name == "count" => ControlFlow::Break(()),
        _ => ControlFlow::Continue(()),
    })
    .is_break()
}

fn conditional_op_to_operator(op: ConditionalOperator) -> Result<Operator> {
    match op {
        ConditionalOperator::Eq => Ok(Operator::Eq),
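is_zero_filled_aggregate_field above walks the field's expression tree and breaks out as soon as it finds a count call. The same early-exit pattern can be sketched stand-alone with a toy expression type in place of the InfluxQL AST (Expr, walk and is_zero_filled below are illustrative stand-ins, not the crate's types):

use std::ops::ControlFlow;

/// Toy stand-in for the InfluxQL expression AST.
enum Expr {
    Call { name: String, args: Vec<Expr> },
    Column(String),
}

/// Visit every node depth-first, stopping as soon as the visitor breaks.
fn walk(e: &Expr, visit: &mut dyn FnMut(&Expr) -> ControlFlow<()>) -> ControlFlow<()> {
    if visit(e).is_break() {
        return ControlFlow::Break(());
    }
    if let Expr::Call { args, .. } = e {
        for arg in args {
            if walk(arg, visit).is_break() {
                return ControlFlow::Break(());
            }
        }
    }
    ControlFlow::Continue(())
}

/// True if the expression contains a COUNT call anywhere in its tree.
fn is_zero_filled(e: &Expr) -> bool {
    walk(e, &mut |e| match e {
        Expr::Call { name, .. } if name == "count" => ControlFlow::Break(()),
        _ => ControlFlow::Continue(()),
    })
    .is_break()
}

fn main() {
    let count = Expr::Call {
        name: "count".into(),
        args: vec![Expr::Column("f64_field".into())],
    };
    let sum = Expr::Call {
        name: "sum".into(),
        args: vec![Expr::Column("f64_field".into())],
    };
    assert!(is_zero_filled(&count)); // count(f64_field) zero-fills
    assert!(!is_zero_filled(&sum)); // sum(f64_field) does not
}

ControlFlow::Break carries the "found it" signal up through the recursion, so the walk stops at the first matching call instead of visiting the whole tree.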
@ -4018,6 +4035,19 @@ mod test {
Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
"###);
// selector
assert_snapshot!(plan("SELECT NON_NEGATIVE_DERIVATIVE(LAST(usage_idle)) FROM cpu GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N]
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_derivative [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N]
Filter: NOT non_negative_derivative IS NULL [time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N]
Projection: time, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]) AS non_negative_derivative [time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N]
WindowAggr: windowExpr=[[non_negative_derivative((selector_last(cpu.usage_idle,cpu.time))[value], IntervalMonthDayNano("10000000000"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value])]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]):Float64;N]
GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N]
Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
"###);
}
#[test]
@ -4078,7 +4108,7 @@ mod test {
"###);
assert_snapshot!(plan("SELECT COUNT(DISTINCT usage_idle) FROM cpu"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, COUNT(DISTINCT cpu.usage_idle) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(DISTINCT cpu.usage_idle), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Aggregate: groupBy=[[]], aggr=[[COUNT(DISTINCT cpu.usage_idle)]] [COUNT(DISTINCT cpu.usage_idle):Int64;N]
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
"###);
@ -4149,7 +4179,7 @@ mod test {
fn test_selectors_and_aggregate() {
assert_snapshot!(plan("SELECT LAST(usage_idle), COUNT(usage_idle) FROM cpu"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), last:Float64;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, (selector_last(cpu.usage_idle,cpu.time))[value] AS last, COUNT(cpu.usage_idle) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), last:Float64;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, (selector_last(cpu.usage_idle,cpu.time))[value] AS last, coalesce_struct(COUNT(cpu.usage_idle), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), last:Float64;N, count:Int64;N]
Aggregate: groupBy=[[]], aggr=[[selector_last(cpu.usage_idle, cpu.time), COUNT(cpu.usage_idle)]] [selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, COUNT(cpu.usage_idle):Int64;N]
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
"###);
@ -4828,20 +4858,20 @@ mod test {
fn no_group_by() {
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N]
Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY non_existent"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS non_existent, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N]
Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY foo"), @r###"
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
@ -4849,7 +4879,7 @@ mod test {
// The `COUNT(f64_field)` aggregate is only projected once in the Aggregate and reused in the projection
assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) + COUNT(f64_field), COUNT(f64_field) * 3 FROM data"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_count:Int64;N, count_1:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, COUNT(data.f64_field) AS count, COUNT(data.f64_field) + COUNT(data.f64_field) AS count_count, COUNT(data.f64_field) * Int64(3) AS count_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_count:Int64;N, count_1:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count, coalesce_struct(COUNT(data.f64_field), Int64(0)) + coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count_count, coalesce_struct(COUNT(data.f64_field), Int64(0)) * Int64(3) AS count_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_count:Int64;N, count_1:Int64;N]
Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
@ -4857,7 +4887,7 @@ mod test {
// non-existent tags are excluded from the Aggregate groupBy and Sort operators
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY foo, non_existent"), @r###"
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N]
Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
@ -4865,7 +4895,7 @@ mod test {
// Aggregate expression is projected once and reused in final projection
assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) * 2 FROM data"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_1:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, COUNT(data.f64_field) AS count, COUNT(data.f64_field) * Int64(2) AS count_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_1:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count, coalesce_struct(COUNT(data.f64_field), Int64(0)) * Int64(2) AS count_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N, count_1:Int64;N]
Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
"###);
@ -4904,7 +4934,7 @@ mod test {
fn group_by_time() {
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s) FILL(none)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4913,7 +4943,7 @@ mod test {
// supports offset parameter
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s, 5s) FILL(none)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(5000000000, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4925,7 +4955,7 @@ mod test {
// No time bounds
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4938,7 +4968,7 @@ mod test {
// No lower time bounds
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data WHERE time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1667181719999999999, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4951,7 +4981,7 @@ mod test {
// No upper time bounds
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None)))..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4964,7 +4994,7 @@ mod test {
// Default is FILL(null)
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None)))..Included(Literal(TimestampNanosecond(1667181719999999999, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4976,7 +5006,7 @@ mod test {
fn group_by_time_gapfill_default_is_fill_null1() {
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
@ -4988,7 +5018,7 @@ mod test {
fn group_by_time_gapfill_default_is_fill_null2() {
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s) FILL(null)"), @r###"
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
GapFill: groupBy=[time], aggr=[[COUNT(data.f64_field)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
|
@ -5000,7 +5030,7 @@ mod test {
|
|||
fn group_by_time_gapfill_default_is_fill_null3() {
|
||||
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s) FILL(previous)"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
GapFill: groupBy=[time], aggr=[[LOCF(COUNT(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
|
@ -5024,7 +5054,7 @@ mod test {
|
|||
fn group_by_time_gapfill_default_is_fill_null5() {
|
||||
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10s) FILL(linear)"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
GapFill: groupBy=[time], aggr=[[INTERPOLATE(COUNT(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
|
@ -5066,7 +5096,7 @@ mod test {
|
|||
Filter: iox::row <= Int64(1) [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [foo] ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS iox::row]] [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
@ -5080,7 +5110,7 @@ mod test {
|
|||
Filter: iox::row > Int64(1) [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [foo] ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS iox::row]] [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
@ -5094,7 +5124,7 @@ mod test {
|
|||
Filter: iox::row BETWEEN Int64(4) AND Int64(5) [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [foo] ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS iox::row]] [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N, iox::row:UInt64;N]
|
||||
Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N]
|
||||
Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
"###);
|
||||
|
@ -5120,7 +5150,7 @@ mod test {
|
|||
fn group_by_time_precision() {
|
||||
assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY TIME(10u) FILL(none)"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, COUNT(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N]
|
||||
Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[COUNT(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, COUNT(data.f64_field):Int64;N]
|
||||
Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N]
|
||||
|
@ -5378,7 +5408,7 @@ mod test {
|
|||
"###);
|
||||
assert_snapshot!(plan("SELECT count(foo) as foo, first(usage_idle) from cpu group by foo"), @r###"
|
||||
Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Null;N, foo_1:Null;N, first:Float64;N]
|
||||
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS foo, (selector_first(cpu.usage_idle,cpu.time,NULL))[other_1] AS foo_1, (selector_first(cpu.usage_idle,cpu.time,NULL))[value] AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Null;N, foo_1:Null;N, first:Float64;N]
|
||||
Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS foo, (coalesce_struct(selector_first(cpu.usage_idle,cpu.time,NULL), Struct({value:Float64(0),time:TimestampNanosecond(0, None),other_1:NULL})))[other_1] AS foo_1, (selector_first(cpu.usage_idle,cpu.time,NULL))[value] AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Null;N, foo_1:Null;N, first:Float64;N]
|
||||
Aggregate: groupBy=[[]], aggr=[[selector_first(cpu.usage_idle, cpu.time, NULL)]] [selector_first(cpu.usage_idle,cpu.time,NULL):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N]
|
||||
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N]
|
||||
"###);
|
||||
|
|
|
@ -1029,7 +1029,7 @@ impl FieldChecker {
|
|||
ProjectionType::TopBottomSelector
|
||||
} else if self.has_group_by_time {
|
||||
if self.window_count > 0 {
|
||||
if self.window_count == self.aggregate_count {
|
||||
if self.window_count == self.aggregate_count + self.selector_count {
|
||||
ProjectionType::WindowAggregate
|
||||
} else {
|
||||
ProjectionType::WindowAggregateMixed
|
||||
|
|
|
@ -118,6 +118,7 @@ fn number_to_scalar(n: &Number, data_type: &DataType) -> Result<ScalarValue> {
|
|||
),
|
||||
fields.clone(),
|
||||
),
|
||||
(_, DataType::Null) => ScalarValue::Null,
|
||||
(n, data_type) => {
|
||||
// The only output data types expected are Int64, Float64 or UInt64
|
||||
return error::internal(format!("no conversion from {n} to {data_type}"));
|
||||
|
|
|
@ -8,7 +8,11 @@ use cache_system::{
|
|||
PolicyBackend,
|
||||
},
|
||||
cache::{driver::CacheDriver, metrics::CacheWithMetrics, Cache},
|
||||
loader::{metrics::MetricsLoader, FunctionLoader},
|
||||
loader::{
|
||||
batch::{BatchLoader, BatchLoaderFlusher, BatchLoaderFlusherExt},
|
||||
metrics::MetricsLoader,
|
||||
FunctionLoader,
|
||||
},
|
||||
resource_consumption::FunctionEstimator,
|
||||
};
|
||||
use data_types::{
|
||||
|
@ -16,17 +20,17 @@ use data_types::{
|
|||
ColumnId, Partition, PartitionId, TransitionPartitionId,
|
||||
};
|
||||
use datafusion::scalar::ScalarValue;
|
||||
use iox_catalog::{interface::Catalog, partition_lookup};
|
||||
use iox_catalog::{interface::Catalog, partition_lookup_batch};
|
||||
use iox_query::chunk_statistics::{ColumnRange, ColumnRanges};
|
||||
use iox_time::TimeProvider;
|
||||
use observability_deps::tracing::debug;
|
||||
use schema::sort::SortKey;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
collections::{hash_map::Entry, HashMap, HashSet},
|
||||
mem::{size_of, size_of_val},
|
||||
sync::Arc,
|
||||
};
|
||||
use trace::span::Span;
|
||||
use trace::span::{Span, SpanRecorder};
|
||||
|
||||
use super::{namespace::CachedTable, ram::RamSize};
|
||||
|
||||
|
@ -46,6 +50,7 @@ type CacheT = Box<
|
|||
pub struct PartitionCache {
|
||||
cache: CacheT,
|
||||
remove_if_handle: RemoveIfHandle<PartitionId, Option<CachedPartition>>,
|
||||
flusher: Arc<dyn BatchLoaderFlusher>,
|
||||
}
|
||||
|
||||
impl PartitionCache {
|
||||
|
@ -58,24 +63,59 @@ impl PartitionCache {
|
|||
ram_pool: Arc<ResourcePool<RamSize>>,
|
||||
testing: bool,
|
||||
) -> Self {
|
||||
let loader =
|
||||
FunctionLoader::new(move |partition_id: PartitionId, extra: Arc<CachedTable>| {
|
||||
let loader = FunctionLoader::new(
|
||||
move |partition_ids: Vec<PartitionId>, cached_tables: Vec<Arc<CachedTable>>| {
|
||||
// sanity checks
|
||||
assert_eq!(partition_ids.len(), cached_tables.len());
|
||||
|
||||
let catalog = Arc::clone(&catalog);
|
||||
let backoff_config = backoff_config.clone();
|
||||
|
||||
async move {
|
||||
let partition = Backoff::new(&backoff_config)
|
||||
// prepare output buffer
|
||||
let mut out = (0..partition_ids.len()).map(|_| None).collect::<Vec<_>>();
|
||||
let mut out_map =
|
||||
HashMap::<PartitionId, usize>::with_capacity(partition_ids.len());
|
||||
for (idx, id) in partition_ids.iter().enumerate() {
|
||||
match out_map.entry(*id) {
|
||||
Entry::Occupied(_) => unreachable!("cache system requested same partition from loader concurrently, this should have been prevented by the CacheDriver"),
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// build `&[&TransitionPartitionId]` for batch catalog request
|
||||
let ids = partition_ids
|
||||
.iter()
|
||||
.copied()
|
||||
.map(TransitionPartitionId::Deprecated)
|
||||
.collect::<Vec<_>>();
|
||||
let ids = ids.iter().collect::<Vec<_>>();
|
||||
|
||||
// fetch catalog data
|
||||
let partitions = Backoff::new(&backoff_config)
|
||||
.retry_all_errors("get partition_key", || async {
|
||||
let mut repos = catalog.repositories().await;
|
||||
let id = TransitionPartitionId::Deprecated(partition_id);
|
||||
partition_lookup(repos.as_mut(), &id).await
|
||||
partition_lookup_batch(repos.as_mut(), &ids).await
|
||||
})
|
||||
.await
|
||||
.expect("retry forever")?;
|
||||
.expect("retry forever");
|
||||
|
||||
Some(CachedPartition::new(partition, &extra))
|
||||
// build output
|
||||
for p in partitions {
|
||||
let idx = out_map[&p.id];
|
||||
let cached_table = &cached_tables[idx];
|
||||
let p = CachedPartition::new(p, cached_table);
|
||||
out[idx] = Some(p);
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
let loader = Arc::new(BatchLoader::new(loader));
|
||||
let flusher = Arc::clone(&loader);
|
||||
let loader = Arc::new(MetricsLoader::new(
|
||||
loader,
|
||||
CACHE_ID,
|
||||
|
@ -111,51 +151,79 @@ impl PartitionCache {
|
|||
Self {
|
||||
cache,
|
||||
remove_if_handle,
|
||||
flusher,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cached partitions.
|
||||
///
|
||||
/// The result only contains existing partitions. The order is undefined.
|
||||
///
|
||||
/// Expire partition if the cached sort key does NOT cover the given set of columns.
|
||||
pub async fn get(
|
||||
&self,
|
||||
cached_table: Arc<CachedTable>,
|
||||
partition_id: PartitionId,
|
||||
sort_key_should_cover: &[ColumnId],
|
||||
partitions: Vec<PartitionRequest>,
|
||||
span: Option<Span>,
|
||||
) -> Option<CachedPartition> {
|
||||
self.remove_if_handle
|
||||
.remove_if_and_get(
|
||||
&self.cache,
|
||||
partition_id,
|
||||
|cached_partition| {
|
||||
let invalidates =
|
||||
if let Some(sort_key) = &cached_partition.and_then(|p| p.sort_key) {
|
||||
sort_key_should_cover
|
||||
.iter()
|
||||
.any(|col| !sort_key.column_set.contains(col))
|
||||
} else {
|
||||
// no sort key at all => need to update if there is anything to cover
|
||||
!sort_key_should_cover.is_empty()
|
||||
};
|
||||
) -> Vec<CachedPartition> {
|
||||
let span_recorder = SpanRecorder::new(span);
|
||||
|
||||
if invalidates {
|
||||
debug!(
|
||||
partition_id = partition_id.get(),
|
||||
"invalidate partition cache",
|
||||
);
|
||||
}
|
||||
let futures = partitions
|
||||
.into_iter()
|
||||
.map(
|
||||
|PartitionRequest {
|
||||
partition_id,
|
||||
sort_key_should_cover,
|
||||
}| {
|
||||
let cached_table = Arc::clone(&cached_table);
|
||||
let span = span_recorder.child_span("single partition cache lookup");
|
||||
|
||||
invalidates
|
||||
self.remove_if_handle.remove_if_and_get(
|
||||
&self.cache,
|
||||
partition_id,
|
||||
move |cached_partition| {
|
||||
let invalidates = if let Some(sort_key) =
|
||||
&cached_partition.and_then(|p| p.sort_key)
|
||||
{
|
||||
sort_key_should_cover
|
||||
.iter()
|
||||
.any(|col| !sort_key.column_set.contains(col))
|
||||
} else {
|
||||
// no sort key at all => need to update if there is anything to cover
|
||||
!sort_key_should_cover.is_empty()
|
||||
};
|
||||
|
||||
if invalidates {
|
||||
debug!(
|
||||
partition_id = partition_id.get(),
|
||||
"invalidate partition cache",
|
||||
);
|
||||
}
|
||||
|
||||
invalidates
|
||||
},
|
||||
(cached_table, span),
|
||||
)
|
||||
},
|
||||
(cached_table, span),
|
||||
)
|
||||
.await
|
||||
.collect();
|
||||
|
||||
let res = self.flusher.auto_flush(futures).await;
|
||||
|
||||
res.into_iter().flatten().collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Request for [`PartitionCache::get`].
|
||||
#[derive(Debug)]
|
||||
pub struct PartitionRequest {
|
||||
pub partition_id: PartitionId,
|
||||
pub sort_key_should_cover: Vec<ColumnId>,
|
||||
}
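
For orientation (not part of the change itself): a minimal sketch of how a caller could drive the new batched lookup, using the `PartitionRequest` and `CachedPartition` types introduced in this diff. The `cache`, `cached_table`, and partition IDs are assumed to come from surrounding setup code.

// Hypothetical helper; `PartitionCache`, `PartitionRequest` and `CachedPartition`
// are the types from this diff, everything else is assumed context.
async fn lookup_partitions(
    cache: &PartitionCache,
    cached_table: Arc<CachedTable>,
    ids: &[PartitionId],
) -> HashMap<PartitionId, CachedPartition> {
    let requests = ids
        .iter()
        .map(|id| PartitionRequest {
            partition_id: *id,
            // empty => never invalidates the cached sort key
            sort_key_should_cover: vec![],
        })
        .collect();

    // The result only contains partitions that exist and its order is
    // undefined, so index the output by partition ID.
    cache
        .get(cached_table, requests, None)
        .await
        .into_iter()
        .map(|p| (p.id, p))
        .collect()
}
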
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct CachedPartition {
|
||||
pub id: PartitionId,
|
||||
pub sort_key: Option<Arc<PartitionSortKey>>,
|
||||
pub column_ranges: ColumnRanges,
|
||||
}
|
||||
|
@ -231,6 +299,7 @@ impl CachedPartition {
|
|||
column_ranges.shrink_to_fit();
|
||||
|
||||
Self {
|
||||
id: partition.id,
|
||||
sort_key,
|
||||
column_ranges: Arc::new(column_ranges),
|
||||
}
|
||||
|
@ -298,12 +367,15 @@ mod tests {
|
|||
use crate::cache::{
|
||||
ram::test_util::test_ram_pool, test_util::assert_catalog_access_metric_count,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use data_types::{partition_template::TablePartitionTemplateOverride, ColumnType};
|
||||
use futures::StreamExt;
|
||||
use generated_types::influxdata::iox::partition_template::v1::{
|
||||
template_part::Part, PartitionTemplate, TemplatePart,
|
||||
};
|
||||
use iox_tests::TestCatalog;
|
||||
use iox_tests::{TestCatalog, TestNamespace};
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use tokio::sync::Barrier;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sort_key() {
|
||||
|
@ -348,7 +420,7 @@ mod tests {
|
|||
);
|
||||
|
||||
let sort_key1a = cache
|
||||
.get(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
|
||||
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
|
@ -360,18 +432,26 @@ mod tests {
|
|||
column_order: [c1.column.id, c2.column.id].into(),
|
||||
}
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 1);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
let sort_key2 = cache
|
||||
.get(Arc::clone(&cached_table), p2.id, &Vec::new(), None)
|
||||
.get_one(Arc::clone(&cached_table), p2.id, &Vec::new(), None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
assert_eq!(sort_key2, None);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 2);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
2,
|
||||
);
|
||||
|
||||
let sort_key1b = cache
|
||||
.get(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
|
||||
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
|
@ -379,12 +459,16 @@ mod tests {
|
|||
sort_key1a.as_ref().unwrap(),
|
||||
sort_key1b.as_ref().unwrap()
|
||||
));
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 2);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
2,
|
||||
);
|
||||
|
||||
// non-existing partition
|
||||
for _ in 0..2 {
|
||||
let res = cache
|
||||
.get(
|
||||
.get_one(
|
||||
Arc::clone(&cached_table),
|
||||
PartitionId::new(i64::MAX),
|
||||
&Vec::new(),
|
||||
|
@ -392,7 +476,11 @@ mod tests {
|
|||
)
|
||||
.await;
|
||||
assert_eq!(res, None);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 3);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
3,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -461,7 +549,7 @@ mod tests {
|
|||
);
|
||||
|
||||
let ranges1a = cache
|
||||
.get(Arc::clone(&cached_table), p1.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
|
@ -488,10 +576,14 @@ mod tests {
|
|||
&ranges1a.get("tag1").unwrap().min_value,
|
||||
&ranges1a.get("tag1").unwrap().max_value,
|
||||
));
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 1);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
let ranges2 = cache
|
||||
.get(Arc::clone(&cached_table), p2.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p2.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
|
@ -505,10 +597,14 @@ mod tests {
|
|||
}
|
||||
),]),
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 2);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
2,
|
||||
);
|
||||
|
||||
let ranges3 = cache
|
||||
.get(Arc::clone(&cached_table), p3.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p3.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
|
@ -531,10 +627,14 @@ mod tests {
|
|||
),
|
||||
]),
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 3);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
3,
|
||||
);
|
||||
|
||||
let ranges4 = cache
|
||||
.get(Arc::clone(&cached_table), p4.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p4.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
|
@ -557,10 +657,14 @@ mod tests {
|
|||
),
|
||||
]),
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
4,
|
||||
);
|
||||
|
||||
let ranges5 = cache
|
||||
.get(Arc::clone(&cached_table), p5.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p5.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
|
@ -574,20 +678,28 @@ mod tests {
|
|||
}
|
||||
),]),
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
5,
|
||||
);
|
||||
|
||||
let ranges1b = cache
|
||||
.get(Arc::clone(&cached_table), p1.id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.column_ranges;
|
||||
assert!(Arc::ptr_eq(&ranges1a, &ranges1b));
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
5,
|
||||
);
|
||||
|
||||
// non-existing partition
|
||||
for _ in 0..2 {
|
||||
let res = cache
|
||||
.get(
|
||||
.get_one(
|
||||
Arc::clone(&cached_table),
|
||||
PartitionId::new(i64::MAX),
|
||||
&[],
|
||||
|
@ -595,7 +707,11 @@ mod tests {
|
|||
)
|
||||
.await;
|
||||
assert_eq!(res, None);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 6);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
6,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -635,31 +751,43 @@ mod tests {
|
|||
);
|
||||
|
||||
let sort_key = cache
|
||||
.get(Arc::clone(&cached_table), p_id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p_id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
assert_eq!(sort_key, None,);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 1);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
// requesting nothing will not expire
|
||||
assert!(p_sort_key.is_none());
|
||||
let sort_key = cache
|
||||
.get(Arc::clone(&cached_table), p_id, &[], None)
|
||||
.get_one(Arc::clone(&cached_table), p_id, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
assert_eq!(sort_key, None,);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 1);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
// but requesting something will expire
|
||||
let sort_key = cache
|
||||
.get(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
|
||||
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
assert_eq!(sort_key, None,);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 2);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
2,
|
||||
);
|
||||
|
||||
// set sort key
|
||||
let p = p
|
||||
|
@ -668,11 +796,12 @@ mod tests {
|
|||
c2.column.name.as_str(),
|
||||
]))
|
||||
.await;
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 1);
|
||||
|
||||
// expire & fetch
|
||||
let p_sort_key = p.partition.sort_key();
|
||||
let sort_key = cache
|
||||
.get(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
|
||||
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
|
@ -684,7 +813,11 @@ mod tests {
|
|||
column_order: [c1.column.id, c2.column.id].into(),
|
||||
}
|
||||
);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
3,
|
||||
);
|
||||
|
||||
// subsets and the full key don't expire
|
||||
for should_cover in [
|
||||
|
@ -694,7 +827,7 @@ mod tests {
|
|||
vec![c1.column.id, c2.column.id],
|
||||
] {
|
||||
let sort_key_2 = cache
|
||||
.get(Arc::clone(&cached_table), p_id, &should_cover, None)
|
||||
.get_one(Arc::clone(&cached_table), p_id, &should_cover, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.sort_key;
|
||||
|
@ -702,13 +835,17 @@ mod tests {
|
|||
sort_key.as_ref().unwrap(),
|
||||
sort_key_2.as_ref().unwrap()
|
||||
));
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
3,
|
||||
);
|
||||
}
|
||||
|
||||
// unknown columns expire
|
||||
let c3 = t.create_column("x", ColumnType::Tag).await;
|
||||
let sort_key_2 = cache
|
||||
.get(
|
||||
.get_one(
|
||||
Arc::clone(&cached_table),
|
||||
p_id,
|
||||
&[c1.column.id, c3.column.id],
|
||||
|
@ -722,10 +859,259 @@ mod tests {
|
|||
sort_key_2.as_ref().unwrap()
|
||||
));
|
||||
assert_eq!(sort_key, sort_key_2);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
4,
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multi_get() {
|
||||
let catalog = TestCatalog::new();
|
||||
|
||||
let ns = catalog.create_namespace_1hr_retention("ns").await;
|
||||
let t = ns.create_table("table").await;
|
||||
let p1 = t.create_partition("k1").await.partition.clone();
|
||||
let p2 = t.create_partition("k2").await.partition.clone();
|
||||
let cached_table = Arc::new(CachedTable {
|
||||
id: t.table.id,
|
||||
schema: schema(),
|
||||
column_id_map: HashMap::default(),
|
||||
column_id_map_rev: HashMap::default(),
|
||||
primary_key_column_ids: [].into(),
|
||||
partition_template: TablePartitionTemplateOverride::default(),
|
||||
});
|
||||
|
||||
let cache = PartitionCache::new(
|
||||
catalog.catalog(),
|
||||
BackoffConfig::default(),
|
||||
catalog.time_provider(),
|
||||
&catalog.metric_registry(),
|
||||
test_ram_pool(),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut res = cache
|
||||
.get(
|
||||
Arc::clone(&cached_table),
|
||||
vec![
|
||||
PartitionRequest {
|
||||
partition_id: p1.id,
|
||||
sort_key_should_cover: vec![],
|
||||
},
|
||||
PartitionRequest {
|
||||
partition_id: p2.id,
|
||||
sort_key_should_cover: vec![],
|
||||
},
|
||||
PartitionRequest {
|
||||
partition_id: p1.id,
|
||||
sort_key_should_cover: vec![],
|
||||
},
|
||||
PartitionRequest {
|
||||
// requesting non-existing partitions is fine, they just don't appear in the output
|
||||
partition_id: PartitionId::new(i64::MAX),
|
||||
sort_key_should_cover: vec![],
|
||||
},
|
||||
],
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
res.sort_by_key(|p| p.id);
|
||||
let ids = res.iter().map(|p| p.id).collect::<Vec<_>>();
|
||||
assert_eq!(ids, vec![p1.id, p1.id, p2.id]);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
// empty get
|
||||
let res = cache.get(Arc::clone(&cached_table), vec![], None).await;
|
||||
assert_eq!(res, vec![]);
|
||||
}
|
||||
|
||||
/// This is a regression test for <https://github.com/influxdata/influxdb_iox/issues/8286>.
|
||||
///
|
||||
/// The issue happened when requests for multiple (different) tables were made concurrently. The root cause was the
|
||||
/// wrong assumption that when flushing the batched up requests, there would only be a single table in the flushed set.
|
||||
///
|
||||
/// To trigger this, we need at least 2 tokio threads.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn test_multi_table_concurrent_get() {
|
||||
// In most cases, the issue triggers on the first run. However, let's be sure and try multiple times.
|
||||
for _ in 0..10 {
|
||||
test_multi_table_concurrent_get_inner().await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Actual implementation of [`test_multi_table_concurrent_get`] that is tried multiple times.
|
||||
async fn test_multi_table_concurrent_get_inner() {
|
||||
let catalog = TestCatalog::new();
|
||||
|
||||
// prepare catalog state for two tables
|
||||
let ns = catalog.create_namespace_1hr_retention("ns").await;
|
||||
let state_1 = ConcurrencyTestState::prepare(&ns, "t1").await;
|
||||
let state_2 = ConcurrencyTestState::prepare(&ns, "t2").await;
|
||||
|
||||
// sanity checks for test setup
|
||||
assert!(!Arc::ptr_eq(&state_1.cached_table, &state_2.cached_table));
|
||||
assert_ne!(state_1.cached_table.id, state_2.cached_table.id);
|
||||
assert_ne!(state_1.c_id, state_2.c_id);
|
||||
assert_ne!(state_1.partitions, state_2.partitions);
|
||||
|
||||
let cache = Arc::new(PartitionCache::new(
|
||||
catalog.catalog(),
|
||||
BackoffConfig::default(),
|
||||
catalog.time_provider(),
|
||||
&catalog.metric_registry(),
|
||||
test_ram_pool(),
|
||||
true,
|
||||
));
|
||||
|
||||
// use a barrier to make sure that both tokio tasks are running at the same time
|
||||
let barrier = Arc::new(Barrier::new(2));
|
||||
|
||||
// set up first tokio task
|
||||
let barrier_captured = Arc::clone(&barrier);
|
||||
let cache_captured = Arc::clone(&cache);
|
||||
let handle_1 = tokio::spawn(async move {
|
||||
barrier_captured.wait().await;
|
||||
|
||||
// When running quickly, both tasks will end up on the same tokio worker and will run in sequence. It seems
|
||||
// that tokio tries to avoid costly work-stealing. However, we can trick tokio into actually running both
|
||||
// tasks concurrently with a bit more async work: a simple sleep.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
|
||||
state_1.run(cache_captured).await;
|
||||
});
|
||||
|
||||
// set up the 2nd tokio task in the same manner as the first one (but for the other table)
|
||||
let barrier_captured = Arc::clone(&barrier);
|
||||
let cache_captured = Arc::clone(&cache);
|
||||
let handle_2 = tokio::spawn(async move {
|
||||
barrier_captured.wait().await;
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
state_2.run(cache_captured).await;
|
||||
});
|
||||
|
||||
handle_1.await.unwrap();
|
||||
handle_2.await.unwrap();
|
||||
}
|
||||
|
||||
/// Building block for a single table within the [`test_multi_table_concurrent_get`] test.
|
||||
struct ConcurrencyTestState {
|
||||
/// Cached table that is used for [`PartitionCache::get`].
|
||||
cached_table: Arc<CachedTable>,
|
||||
|
||||
/// ID of the only column within that table.
|
||||
c_id: ColumnId,
|
||||
|
||||
/// Partitions within that table.
|
||||
partitions: Vec<PartitionId>,
|
||||
}
|
||||
|
||||
impl ConcurrencyTestState {
|
||||
/// Prepare catalog state.
|
||||
async fn prepare(ns: &Arc<TestNamespace>, name: &str) -> Self {
|
||||
let t = ns.create_table(name).await;
|
||||
let c = t.create_column("time", ColumnType::Time).await;
|
||||
let cached_table = Arc::new(CachedTable {
|
||||
id: t.table.id,
|
||||
schema: schema(),
|
||||
column_id_map: HashMap::from([(c.column.id, Arc::from(c.column.name.clone()))]),
|
||||
column_id_map_rev: HashMap::from([(Arc::from(c.column.name.clone()), c.column.id)]),
|
||||
primary_key_column_ids: [c.column.id].into(),
|
||||
partition_template: TablePartitionTemplateOverride::default(),
|
||||
});
|
||||
const N_PARTITIONS: usize = 20;
|
||||
let mut partitions = futures::stream::iter(0..N_PARTITIONS)
|
||||
.then(|i| {
|
||||
let t = Arc::clone(&t);
|
||||
async move {
|
||||
t.create_partition_with_sort_key(&format!("p{i}"), &["time"])
|
||||
.await
|
||||
.partition
|
||||
.id
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
partitions.sort();
|
||||
|
||||
Self {
|
||||
cached_table,
|
||||
c_id: c.column.id,
|
||||
partitions,
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform the actual [`PartitionCache::get`] call and run some basic sanity checks on the result.
|
||||
async fn run(self, cache: Arc<PartitionCache>) {
|
||||
let Self {
|
||||
cached_table,
|
||||
c_id,
|
||||
partitions,
|
||||
} = self;
|
||||
|
||||
let mut results = cache
|
||||
.get(
|
||||
cached_table,
|
||||
partitions
|
||||
.iter()
|
||||
.map(|p| PartitionRequest {
|
||||
partition_id: *p,
|
||||
sort_key_should_cover: vec![],
|
||||
})
|
||||
.collect(),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
results.sort_by_key(|p| p.id);
|
||||
let partitions_res = results.iter().map(|p| p.id).collect::<Vec<_>>();
|
||||
assert_eq!(partitions, partitions_res);
|
||||
assert!(results
|
||||
.iter()
|
||||
.all(|p| p.sort_key.as_ref().unwrap().column_set == HashSet::from([c_id])));
|
||||
}
|
||||
}
|
||||
|
||||
fn schema() -> Schema {
|
||||
SchemaBuilder::new().build().unwrap()
|
||||
}
|
||||
|
||||
/// Extension methods for simpler testing.
|
||||
#[async_trait]
|
||||
trait PartitionCacheExt {
|
||||
async fn get_one(
|
||||
&self,
|
||||
cached_table: Arc<CachedTable>,
|
||||
partition_id: PartitionId,
|
||||
sort_key_should_cover: &[ColumnId],
|
||||
span: Option<Span>,
|
||||
) -> Option<CachedPartition>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PartitionCacheExt for PartitionCache {
|
||||
async fn get_one(
|
||||
&self,
|
||||
cached_table: Arc<CachedTable>,
|
||||
partition_id: PartitionId,
|
||||
sort_key_should_cover: &[ColumnId],
|
||||
span: Option<Span>,
|
||||
) -> Option<CachedPartition> {
|
||||
self.get(
|
||||
cached_table,
|
||||
vec![PartitionRequest {
|
||||
partition_id,
|
||||
sort_key_should_cover: sort_key_should_cover.to_vec(),
|
||||
}],
|
||||
span,
|
||||
)
|
||||
.await
|
||||
.into_iter()
|
||||
.next()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -106,6 +106,7 @@ pub mod tests {
|
|||
|
||||
use crate::cache::{
|
||||
namespace::{CachedNamespace, CachedTable},
|
||||
partition::PartitionRequest,
|
||||
CatalogCache,
|
||||
};
|
||||
|
||||
|
@ -249,11 +250,15 @@ pub mod tests {
|
|||
.partition()
|
||||
.get(
|
||||
Arc::clone(&self.cached_table),
|
||||
self.parquet_file.partition_id,
|
||||
&[],
|
||||
vec![PartitionRequest {
|
||||
partition_id: self.parquet_file.partition_id,
|
||||
sort_key_should_cover: vec![],
|
||||
}],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.into_iter()
|
||||
.next()
|
||||
.unwrap();
|
||||
let cached_partitions =
|
||||
HashMap::from([(self.parquet_file.partition_id, cached_partition)]);
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
use self::query_access::QuerierTableChunkPruner;
|
||||
use crate::{
|
||||
cache::{namespace::CachedTable, partition::CachedPartition},
|
||||
cache::{
|
||||
namespace::CachedTable,
|
||||
partition::{CachedPartition, PartitionRequest},
|
||||
},
|
||||
ingester::{self, IngesterPartition},
|
||||
parquet::ChunkAdapter,
|
||||
IngesterConnection, CONCURRENT_CHUNK_CREATION_JOBS,
|
||||
IngesterConnection,
|
||||
};
|
||||
use data_types::{ColumnId, NamespaceId, ParquetFile, PartitionId, TableId};
|
||||
use datafusion::error::DataFusionError;
|
||||
use futures::{join, StreamExt};
|
||||
use futures::join;
|
||||
use iox_query::{provider, provider::ChunkPruner, QueryChunk};
|
||||
use observability_deps::tracing::{debug, trace};
|
||||
use predicate::Predicate;
|
||||
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
|
||||
use schema::Schema;
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use std::{
|
||||
|
@ -345,33 +347,26 @@ impl QuerierTable {
|
|||
.extend(f.column_set.iter().copied().filter(|id| pk.contains(id)));
|
||||
}
|
||||
|
||||
// shuffle order to even catalog load, because cache hits/misses might be correlated w/ the order of the
|
||||
// partitions.
|
||||
//
|
||||
// Note that we sort before shuffling to achieve a deterministic pseudo-random order
|
||||
let mut partitions = should_cover.into_iter().collect::<Vec<_>>();
|
||||
let mut rng = StdRng::seed_from_u64(cached_table.id.get() as u64);
|
||||
partitions.sort_by(|(a_p_id, _a_cols), (b_p_id, _b_cols)| a_p_id.cmp(b_p_id));
|
||||
partitions.shuffle(&mut rng);
|
||||
|
||||
futures::stream::iter(partitions)
|
||||
.map(|(p_id, cover)| {
|
||||
let catalog_cache = self.chunk_adapter.catalog_cache();
|
||||
let span = span_recorder.child_span("fetch partition");
|
||||
|
||||
async move {
|
||||
let cover = cover.into_iter().collect::<Vec<_>>();
|
||||
let cached_partition = catalog_cache
|
||||
.partition()
|
||||
.get(Arc::clone(cached_table), p_id, &cover, span)
|
||||
.await;
|
||||
cached_partition.map(|p| (p_id, p))
|
||||
}
|
||||
// batch request all partitions
|
||||
let requests = should_cover
|
||||
.into_iter()
|
||||
.map(|(id, cover)| PartitionRequest {
|
||||
partition_id: id,
|
||||
sort_key_should_cover: cover.into_iter().collect(),
|
||||
})
|
||||
.buffer_unordered(CONCURRENT_CHUNK_CREATION_JOBS)
|
||||
.filter_map(|x| async move { x })
|
||||
.collect::<HashMap<_, _>>()
|
||||
.await
|
||||
.collect();
|
||||
let partitions = self
|
||||
.chunk_adapter
|
||||
.catalog_cache()
|
||||
.partition()
|
||||
.get(
|
||||
Arc::clone(cached_table),
|
||||
requests,
|
||||
span_recorder.child_span("fetch partitions"),
|
||||
)
|
||||
.await;
|
||||
|
||||
partitions.into_iter().map(|p| (p.id, p)).collect()
|
||||
}
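
Side note on the code removed above: before batching, catalog load was spread by shuffling the partition list with a seed derived from the table ID, sorting first so the order stays deterministic across runs. A minimal, self-contained sketch of that sort-then-seeded-shuffle pattern (the `rand` items match the imports shown earlier in this file; illustrative only):

use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};

fn deterministic_shuffle(mut ids: Vec<i64>, seed: u64) -> Vec<i64> {
    // Sort first so the result is a pure function of the seed ...
    ids.sort();
    // ... then shuffle so catalog access order is not correlated with the
    // callers' natural partition order (and thus with cache hits/misses).
    ids.shuffle(&mut StdRng::seed_from_u64(seed));
    ids
}
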
|
||||
|
||||
/// Get a chunk pruner that can be used to prune chunks retrieved via [`chunks`](Self::chunks)
|
||||
|
@ -891,12 +886,22 @@ mod tests {
|
|||
|
||||
let chunks = querier_table.chunks().await.unwrap();
|
||||
assert_eq!(chunks.len(), 5);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 6);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 2);
|
||||
|
||||
let chunks = querier_table.chunks().await.unwrap();
|
||||
assert_eq!(chunks.len(), 5);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 6);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 4);
|
||||
|
||||
partition_2
|
||||
|
@ -904,12 +909,22 @@ mod tests {
|
|||
TestParquetFileBuilder::default().with_line_protocol("table,tag1=a foo=1,bar=1 11"),
|
||||
)
|
||||
.await;
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 7);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
|
||||
// file not visible yet
|
||||
let chunks = querier_table.chunks().await.unwrap();
|
||||
assert_eq!(chunks.len(), 5);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 7);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
1,
|
||||
);
|
||||
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 6);
|
||||
|
||||
// change ingester ID => invalidates cache
|
||||
|
@ -918,7 +933,12 @@ mod tests {
|
|||
.with_ingester_partition(ingester_partition_builder.build());
|
||||
let chunks = querier_table.chunks().await.unwrap();
|
||||
assert_eq!(chunks.len(), 6);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 8);
|
||||
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
|
||||
assert_catalog_access_metric_count(
|
||||
&catalog.metric_registry,
|
||||
"partition_get_by_id_batch",
|
||||
2,
|
||||
);
|
||||
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 8);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ tokio = { version = "1.29", features = ["macros", "parking_lot", "sync", "time"]
|
|||
tokio-util = { version = "0.7.8" }
|
||||
trace = { path = "../trace"}
|
||||
workspace-hack = { version = "0.1", path = "../workspace-hack" }
|
||||
sysinfo = "0.29.5"
|
||||
sysinfo = "0.29.6"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.7.0"
|
||||
|
|