Merge branch 'main' into 7899/wal-disk-metrics

pull/24376/head
wiedld 2023-07-05 13:52:43 -07:00 committed by GitHub
commit 36e7f53f9b
113 changed files with 2559 additions and 1458 deletions


@ -3,9 +3,6 @@
rustflags = [
"--cfg", "tokio_unstable",
]
rustdocflags = [
"--cfg", "tokio_unstable",
]
# sparse protocol opt-in
# See https://blog.rust-lang.org/2023/03/09/Rust-1.68.0.html#cargos-sparse-protocol
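For reference, the opt-in that these comment lines introduce is a single registry setting in this same config file (shape taken from the linked release notes, not from this diff's visible context):

    [registries.crates-io]
    protocol = "sparse"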

Cargo.lock (generated)

@ -190,7 +190,7 @@ dependencies = [
"arrow-data",
"arrow-schema",
"chrono",
"half 2.2.1",
"half 2.3.1",
"num",
]
@ -205,7 +205,7 @@ dependencies = [
"arrow-schema",
"chrono",
"chrono-tz",
"half 2.2.1",
"half 2.3.1",
"hashbrown 0.14.0",
"num",
]
@ -215,7 +215,7 @@ name = "arrow-buffer"
version = "42.0.0"
source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#20f6bd7ed730d937abe76ab859088094dee8a5d3"
dependencies = [
"half 2.2.1",
"half 2.3.1",
"num",
]
@ -231,7 +231,7 @@ dependencies = [
"arrow-select",
"chrono",
"comfy-table",
"half 2.2.1",
"half 2.3.1",
"lexical-core",
"num",
]
@ -261,7 +261,7 @@ source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/42.0.0_patched#
dependencies = [
"arrow-buffer",
"arrow-schema",
"half 2.2.1",
"half 2.3.1",
"num",
]
@ -315,7 +315,7 @@ dependencies = [
"arrow-data",
"arrow-schema",
"chrono",
"half 2.2.1",
"half 2.3.1",
"indexmap 1.9.3",
"lexical-core",
"num",
@ -333,7 +333,7 @@ dependencies = [
"arrow-data",
"arrow-schema",
"arrow-select",
"half 2.2.1",
"half 2.3.1",
"num",
]
@ -347,7 +347,7 @@ dependencies = [
"arrow-buffer",
"arrow-data",
"arrow-schema",
"half 2.2.1",
"half 2.3.1",
"hashbrown 0.14.0",
]
@ -480,18 +480,18 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
name = "async-trait"
version = "0.1.68"
version = "0.1.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
checksum = "79fa67157abdfd688a259b6648808757db9347af834624f27ec646da976aee5d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -621,9 +621,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.2"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbe3c979c178231552ecba20214a8272df4e09f232a87aef4320cf06539aded"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
[[package]]
name = "blake2"
@ -841,9 +841,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.3.9"
version = "4.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bba77a07e4489fb41bd90e8d4201c3eb246b3c2c9ea2ba0bddd6c1d1df87db7d"
checksum = "384e169cc618c613d5e3ca6404dda77a8685a63e08660dcc64abaf7da7cb0c7a"
dependencies = [
"clap_builder",
"clap_derive",
@ -873,13 +873,12 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.3.9"
version = "4.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c9b4a88bb4bc35d3d6f65a21b0f0bafe9c894fa00978de242c555ec28bea1c0"
checksum = "ef137bbe35aab78bdb468ccfba75a5f4d8321ae011d34063770780545176af2d"
dependencies = [
"anstream",
"anstyle",
"bitflags 1.3.2",
"clap_lex",
"once_cell",
"strsim",
@ -894,7 +893,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -969,7 +968,6 @@ dependencies = [
"object_store",
"observability_deps",
"parquet_file",
"predicate",
"rand",
"schema",
"test_helpers",
@ -1059,9 +1057,9 @@ dependencies = [
[[package]]
name = "console-subscriber"
version = "0.1.9"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57ab2224a0311582eb03adba4caaf18644f7b1f10a760803a803b9b605187fc7"
checksum = "d4cf42660ac07fcebed809cfe561dd8730bcd35b075215e6479c516bcd0d11cb"
dependencies = [
"console-api",
"crossbeam-channel",
@ -1118,9 +1116,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "cpp_demangle"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c76f98bdfc7f66172e6c7065f981ebb576ffc903fe4c0561d9f0c2509226dc6"
checksum = "ee34052ee3d93d6d8f3e6f81d85c47921f6653a19a7b70e939e3e602d893a674"
dependencies = [
"cfg-if",
]
@ -1479,7 +1477,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"datafusion-row",
"half 2.2.1",
"half 2.3.1",
"hashbrown 0.14.0",
"indexmap 1.9.3",
"itertools 0.10.5",
@ -1720,12 +1718,12 @@ dependencies = [
[[package]]
name = "fd-lock"
version = "3.0.12"
version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ae6b3d9530211fb3b12a95374b8b0823be812f53d09e18c5675c0146b09642"
checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5"
dependencies = [
"cfg-if",
"rustix",
"rustix 0.38.2",
"windows-sys 0.48.0",
]
@ -1899,7 +1897,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -2062,9 +2060,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.19"
version = "0.3.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782"
checksum = "97ec8491ebaf99c8eaa73058b045fe58073cd6be7f596ac993ced0b0a0c01049"
dependencies = [
"bytes",
"fnv",
@ -2087,10 +2085,11 @@ checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.2.1"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0"
checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872"
dependencies = [
"cfg-if",
"crunchy",
"num-traits",
]
@ -2150,7 +2149,7 @@ dependencies = [
[[package]]
name = "heappy"
version = "0.1.0"
source = "git+https://github.com/mkmik/heappy?rev=1d6ac77a4026fffce8680a7b31a9f6e9859b5e73#1d6ac77a4026fffce8680a7b31a9f6e9859b5e73"
source = "git+https://github.com/mkmik/heappy?rev=1de977a241cdd768acc5b6c82c0728b30c7db7b4#1de977a241cdd768acc5b6c82c0728b30c7db7b4"
dependencies = [
"backtrace",
"bytes",
@ -2173,9 +2172,9 @@ dependencies = [
[[package]]
name = "hermit-abi"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "hex"
@ -2571,7 +2570,7 @@ version = "0.1.0"
dependencies = [
"flate2",
"hex",
"integer-encoding",
"integer-encoding 4.0.0",
"observability_deps",
"rand",
"snafu",
@ -2730,6 +2729,12 @@ version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "integer-encoding"
version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "924df4f0e24e2e7f9cdd90babb0b96f93b20f3ecfa949ea9e6613756b8c8e1bf"
[[package]]
name = "io-lifetimes"
version = "1.0.11"
@ -3099,19 +3104,18 @@ dependencies = [
[[package]]
name = "ipnet"
version = "2.7.2"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
[[package]]
name = "is-terminal"
version = "0.4.7"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
checksum = "24fddda5af7e54bf7da53067d6e802dbcc381d0a8eef629df528e3ebf68755cb"
dependencies = [
"hermit-abi",
"io-lifetimes",
"rustix",
"rustix 0.38.2",
"windows-sys 0.48.0",
]
@ -3135,9 +3139,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "1.0.6"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
checksum = "c0aa48fab2893d8a49caa94082ae8488f4e1050d73b367881dcd2198f4199fd8"
[[package]]
name = "jobserver"
@ -3262,6 +3266,12 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0"
[[package]]
name = "lock_api"
version = "0.4.10"
@ -3846,7 +3856,7 @@ dependencies = [
"libc",
"redox_syscall 0.3.5",
"smallvec",
"windows-targets 0.48.0",
"windows-targets 0.48.1",
]
[[package]]
@ -3941,9 +3951,9 @@ dependencies = [
[[package]]
name = "paste"
version = "1.0.12"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
checksum = "b4b27ab7be369122c218afc2079489cdcb4b517c0a3fc386ff11e1fedfcc2b35"
[[package]]
name = "pbjson"
@ -4027,7 +4037,7 @@ dependencies = [
"pest_meta",
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -4053,18 +4063,18 @@ dependencies = [
[[package]]
name = "phf"
version = "0.11.1"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.1"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56ac890c5e3ca598bbdeaa99964edb5b0258a583a9eb6ef4e89fc85d9224770"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
@ -4072,9 +4082,9 @@ dependencies = [
[[package]]
name = "phf_generator"
version = "0.11.1"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared",
"rand",
@ -4091,29 +4101,29 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.1"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e138fdd8263907a2b0e1b4e80b7e58c721126479b6e6eedfb1b402acea7b9bd"
checksum = "030ad2bc4db10a8944cb0d837f158bdfec4d4a4873ab701a95046770d11f8842"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.1"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1fef411b303e3e12d534fb6e7852de82da56edd937d895125821fb7c09436c7"
checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
name = "pin-project-lite"
version = "0.2.9"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
checksum = "4c40d25201921e5ff0c862a505c6557ea88568a4e3ace775ab55e93f2f4f9d57"
[[package]]
name = "pin-utils"
@ -4129,9 +4139,9 @@ checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "pprof"
version = "0.11.1"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "196ded5d4be535690899a4631cc9f18cdc41b7ebf24a79400f46f48e49a11059"
checksum = "6b90f8560ad8bd57b207b8293bc5226e48e89039a6e590c12a297d91b84c7e60"
dependencies = [
"backtrace",
"cfg-if",
@ -4239,9 +4249,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.60"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb"
dependencies = [
"unicode-ident",
]
@ -4428,9 +4438,9 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.28"
version = "1.0.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105"
dependencies = [
"proc-macro2",
]
@ -4693,15 +4703,28 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.37.21"
version = "0.37.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f25693a73057a1b4cb56179dd3c7ea21a7c6c5ee7d85781f5749b46f34b79c"
checksum = "8818fa822adcc98b18fedbb3632a6a33213c070556b5aa7c4c8cc21cff565c4c"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"linux-raw-sys 0.3.8",
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aabcb0461ebd01d6b79945797c27f8529082226cb630a9865a71870ff63532a4"
dependencies = [
"bitflags 2.3.3",
"errno",
"libc",
"linux-raw-sys 0.4.3",
"windows-sys 0.48.0",
]
@ -4731,9 +4754,9 @@ dependencies = [
[[package]]
name = "rustls-pemfile"
version = "1.0.2"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2"
dependencies = [
"base64 0.21.2",
]
@ -4760,7 +4783,7 @@ version = "12.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "994eca4bca05c87e86e15d90fc7a91d1be64b4482b38cb2d27474568fe7c9db9"
dependencies = [
"bitflags 2.3.2",
"bitflags 2.3.3",
"cfg-if",
"clipboard-win",
"fd-lock",
@ -4832,29 +4855,29 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc"
[[package]]
name = "serde"
version = "1.0.164"
version = "1.0.166"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
checksum = "d01b7404f9d441d3ad40e6a636a7782c377d2abdbe4fa2440e2edcc2f4f10db8"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.164"
version = "1.0.166"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68"
checksum = "5dd83d6dde2b6b2d466e14d9d1acce8816dedee94f735eac6395808b3483c6d6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
name = "serde_json"
version = "1.0.99"
version = "1.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46266871c240a00b8f503b877622fe33430b3c7d963bdc0f2adc511e54a1eae3"
checksum = "0f1e14e89be7aa4c4b78bdbdc9eb5bf8517829a600ae8eaa39a6e1d960b5185c"
dependencies = [
"itoa",
"ryu",
@ -5423,7 +5446,7 @@ dependencies = [
"proc-macro2",
"quote",
"rustversion",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -5434,9 +5457,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "symbolic-common"
version = "10.2.1"
version = "12.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b55cdc318ede251d0957f07afe5fed912119b8c1bc5a7804151826db999e737"
checksum = "38f7afd8bcd36190409e6b71d89928f7f09d918a7aa3460d847bc49a538d672e"
dependencies = [
"debugid",
"memmap2",
@ -5446,9 +5469,9 @@ dependencies = [
[[package]]
name = "symbolic-demangle"
version = "10.2.1"
version = "12.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79be897be8a483a81fff6a3a4e195b4ac838ef73ca42d348b3f722da9902e489"
checksum = "ec64922563a36e3fe686b6d99f06f25dacad2a202ac7502ed642930a188fb20a"
dependencies = [
"cpp_demangle",
"rustc-demangle",
@ -5468,9 +5491,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.18"
version = "2.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737"
dependencies = [
"proc-macro2",
"quote",
@ -5514,7 +5537,7 @@ dependencies = [
"cfg-if",
"fastrand",
"redox_syscall 0.3.5",
"rustix",
"rustix 0.37.22",
"windows-sys 0.48.0",
]
@ -5579,22 +5602,22 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.40"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
checksum = "c16a64ba9387ef3fdae4f9c1a7f07a0997fce91985c0336f1ddc1822b3b37802"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.40"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
checksum = "d14928354b01c4d6a4f0e549069adef399a284e7995c7ccca94e8a07a5346c59"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -5623,7 +5646,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
dependencies = [
"byteorder",
"integer-encoding",
"integer-encoding 3.0.4",
"log",
"ordered-float 2.10.0",
"threadpool",
@ -5723,7 +5746,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -5914,7 +5937,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8bd22a874a2d0b70452d5597b12c537331d49060824a95f49f108994f94aa4c"
dependencies = [
"bitflags 2.3.2",
"bitflags 2.3.3",
"bytes",
"futures-core",
"futures-util",
@ -6007,7 +6030,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
@ -6319,7 +6342,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
"wasm-bindgen-shared",
]
@ -6353,7 +6376,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -6428,9 +6451,9 @@ dependencies = [
[[package]]
name = "whoami"
version = "1.4.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c70234412ca409cc04e864e89523cb0fc37f5e1344ebed5a3ebf4192b6b9f68"
checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50"
dependencies = [
"wasm-bindgen",
"web-sys",
@ -6473,7 +6496,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets 0.48.0",
"windows-targets 0.48.1",
]
[[package]]
@ -6491,7 +6514,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.0",
"windows-targets 0.48.1",
]
[[package]]
@ -6511,9 +6534,9 @@ dependencies = [
[[package]]
name = "windows-targets"
version = "0.48.0"
version = "0.48.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
dependencies = [
"windows_aarch64_gnullvm 0.48.0",
"windows_aarch64_msvc 0.48.0",
@ -6664,7 +6687,6 @@ dependencies = [
"hashbrown 0.14.0",
"heck",
"indexmap 1.9.3",
"io-lifetimes",
"itertools 0.10.5",
"libc",
"lock_api",
@ -6690,7 +6712,7 @@ dependencies = [
"regex-syntax 0.7.2",
"reqwest",
"ring",
"rustix",
"rustix 0.38.2",
"rustls 0.21.2",
"scopeguard",
"serde",
@ -6702,7 +6724,7 @@ dependencies = [
"sqlx-core",
"sqlx-macros",
"syn 1.0.109",
"syn 2.0.18",
"syn 2.0.23",
"thrift",
"tokio",
"tokio-stream",


@ -25,7 +25,7 @@ tonic = { workspace = true }
[dev-dependencies]
assert_matches = "1.5.0"
parking_lot = "0.12.1"
paste = "1.0.12"
paste = "1.0.13"
test_helpers_end_to_end = { path = "../test_helpers_end_to_end" }
tokio = "1.29.1"


@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { path = "../backoff" }
futures = "0.3"
iox_time = { path = "../iox_time" }


@ -9,7 +9,7 @@ license.workspace = true
[dependencies]
http = "0.2.9"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
thiserror = "1.0.40"
thiserror = "1.0.41"
tonic = { workspace = true }
tower = "0.4"
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { path = "../backoff" }
bytes = "1.4"
compactor_scheduler = { path = "../compactor_scheduler" }
@ -21,7 +21,6 @@ metric = { path = "../metric" }
object_store = { workspace = true }
observability_deps = { path = "../observability_deps" }
parquet_file = { path = "../parquet_file" }
predicate = { path = "../predicate" }
rand = "0.8.3"
schema = { path = "../schema" }
tokio = { version = "1", features = ["macros", "rt", "sync"] }


@ -2,15 +2,10 @@
use std::{any::Any, sync::Arc};
use data_types::{ChunkId, ChunkOrder, PartitionId};
use datafusion::{error::DataFusionError, physical_plan::Statistics};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::create_basic_summary,
QueryChunk, QueryChunkData,
};
use datafusion::physical_plan::Statistics;
use iox_query::{util::create_basic_summary, QueryChunk, QueryChunkData};
use observability_deps::tracing::debug;
use parquet_file::{chunk::ParquetChunk, storage::ParquetStorage};
use predicate::Predicate;
use schema::{merge::SchemaMerger, sort::SortKey, Schema};
use uuid::Uuid;
@ -96,20 +91,6 @@ impl QueryChunk for QueryableParquetChunk {
false
}
/// Return a set of Strings containing the distinct values in the
/// specified columns. If the predicate can be evaluated entirely
/// on the metadata of this Chunk. Returns `None` otherwise
///
/// The requested columns must all have String type.
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
fn data(&self) -> QueryChunkData {
QueryChunkData::Parquet(self.data.parquet_exec_input())
}


@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { path = "../backoff" }
data_types = { path = "../data_types" }
iox_catalog = { path = "../iox_catalog" }


@ -74,18 +74,21 @@ impl PartitionsSource for CatalogToCompactPartitionsSource {
// we're going to check the time range we'd like to query for against the end time of the last query.
let mut last = self.last_maximum_time.lock().unwrap();
// if the last query ended further back in time than this query starts, we're about to skip something.
if *last < minimum_time {
if minimum_time.sub(*last) < self.min_threshold * 3 {
// the end of the last query says we're skipping less than 3x our configured lookback, so
// back up and query everything since the last query.
minimum_time = *last;
} else {
// end of the last query says we're skipping a lot. We should limit how far we look back to avoid
// returning all partitions, so we'll just back up 3x the configured lookback.
// this might skip something (until cold compaction), but we need a limit on how far we look back.
minimum_time = self.time_provider.now() - self.min_threshold * 3;
}
// query for partitions with activity since the last query. We shouldn't query for a time range
// we've already covered. So if the prior query was 2m ago, and the query covered 10m, ending at
// the time of that query, we just need to query for activity in the last 2m. Asking for more than
// that creates busy-work that will spam the catalog with more queries only to determine that no
// compaction is needed. But we also don't want to query so far back in time that we get all partitions, so the
// lookback is limited to 3x the configured threshold.
if minimum_time < *last || minimum_time.sub(*last) < self.min_threshold * 3 {
// the gap since the end of the last query is less than 3x our configured lookback, so we can
// query everything since the last query.
minimum_time = *last;
} else {
// end of the last query says we're skipping a lot. We should limit how far we look back to avoid
// returning all partitions, so we'll just back up 3x the configured lookback.
// this might skip something (until cold compaction), but we need a limit on how far we look back.
minimum_time = self.time_provider.now() - self.min_threshold * 3;
}
maximum_time = self.max_threshold.map(|max| self.time_provider.now() - max);
@ -113,6 +116,7 @@ mod tests {
use data_types::Timestamp;
use iox_catalog::mem::MemCatalog;
use iox_tests::PartitionBuilder;
use iox_time::MockProvider;
fn partition_ids(ids: &[i64]) -> Vec<PartitionId> {
ids.iter().cloned().map(PartitionId::new).collect()
@ -122,17 +126,18 @@ mod tests {
catalog: Arc<MemCatalog>,
min_threshold: Duration,
max_threshold: Option<Duration>,
second_query_delta: Duration, // time between first and second query
first_expected_ids: &[i64], // expected values on first fetch, which does a 3x on min_threshold
second_expected_ids: &[i64], // expected values on second fetch, which uses min_threshold unmodified
) {
let time_provider = catalog.time_provider();
let time_provider = Arc::new(MockProvider::new(catalog.time_provider().now()));
let partitions_source = CatalogToCompactPartitionsSource::new(
Default::default(),
catalog,
min_threshold,
max_threshold,
time_provider,
Arc::<iox_time::MockProvider>::clone(&time_provider),
);
let mut actual_partition_ids = partitions_source.fetch().await;
@ -145,6 +150,7 @@ mod tests {
max_threshold {max_threshold:?} failed (first fetch, 3x lookback)",
);
time_provider.inc(second_query_delta);
let mut actual_partition_ids = partitions_source.fetch().await;
actual_partition_ids.sort();
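Aside (not part of the diff): the test harness now drives a deterministic mock clock instead of reusing the catalog's real one. The shape of the pattern, using only the calls visible in this hunk:

    // Pin the clock to the catalog's current notion of "now"...
    let time_provider = Arc::new(MockProvider::new(catalog.time_provider().now()));
    // ...run the first fetch, then advance the clock explicitly so the
    // second fetch exercises the time-since-last-query branch deterministically.
    time_provider.inc(second_query_delta);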
@ -163,10 +169,15 @@ mod tests {
let time_three_hour_ago = Timestamp::from(time_provider.hours_ago(3));
let time_six_hour_ago = Timestamp::from(time_provider.hours_ago(6));
let time_one_min_future = Timestamp::from(time_provider.minutes_into_future(1));
for (id, time) in [(1, time_three_hour_ago), (2, time_six_hour_ago)]
.iter()
.cloned()
for (id, time) in [
(1, time_three_hour_ago),
(2, time_six_hour_ago),
(3, time_one_min_future),
]
.iter()
.cloned()
{
let partition = PartitionBuilder::new(id as i64)
.with_new_file_at(time)
@ -175,13 +186,44 @@ mod tests {
}
let one_minute = Duration::from_secs(60);
fetch_test(Arc::clone(&catalog), one_minute, None, &[], &[]).await;
let ten_minute = Duration::from_secs(60) * 10;
// the lack of an end time means it gets the future file (3) in the first query; this is an
// oddity of a test case that has files with a future timestamp (not a real-world concern).
// the second query 10m later with a cap of 3m lookback doesn't get it.
fetch_test(
Arc::clone(&catalog),
one_minute,
None,
ten_minute,
&[3],
&[],
)
.await;
let four_hours = Duration::from_secs(60 * 60 * 4);
fetch_test(Arc::clone(&catalog), four_hours, None, &[1, 2], &[1]).await;
// again the future file is included in the first query, just an oddity of the test case.
fetch_test(
Arc::clone(&catalog),
four_hours,
None,
ten_minute,
&[1, 2, 3],
&[3],
)
.await;
let seven_hours = Duration::from_secs(60 * 60 * 7);
fetch_test(Arc::clone(&catalog), seven_hours, None, &[1, 2], &[1, 2]).await;
// again the future file is included in the first query, just an oddity of the test case.
fetch_test(
Arc::clone(&catalog),
seven_hours,
None,
ten_minute,
&[1, 2, 3],
&[3],
)
.await;
}
#[tokio::test]
@ -192,11 +234,13 @@ mod tests {
let time_now = Timestamp::from(time_provider.now());
let time_three_hour_ago = Timestamp::from(time_provider.hours_ago(3));
let time_six_hour_ago = Timestamp::from(time_provider.hours_ago(6));
let time_one_min_future = Timestamp::from(time_provider.minutes_into_future(1));
for (id, time) in [
(1, time_now),
(2, time_three_hour_ago),
(3, time_six_hour_ago),
(4, time_one_min_future),
]
.iter()
.cloned()
@ -209,54 +253,80 @@ mod tests {
let one_minute = Duration::from_secs(60);
let one_hour = Duration::from_secs(60 * 60);
let two_hour = Duration::from_secs(60 * 60 * 2);
let four_hours = Duration::from_secs(60 * 60 * 4);
let seven_hours = Duration::from_secs(60 * 60 * 7);
// File 3 is all that falls within the 7-4h lookback window. With 1m to the next query,
// nothing is found with windows advanced by 1m.
fetch_test(
Arc::clone(&catalog),
seven_hours,
Some(four_hours),
one_minute,
&[3],
&[3],
&[],
)
.await;
// With a 7-1h lookback window, files 2 and 3 are found. With 2h to the next query, the
// window advances to find the two newer files.
fetch_test(
Arc::clone(&catalog),
seven_hours,
Some(one_hour),
two_hour,
&[2, 3],
&[2, 3],
&[1, 4],
)
.await;
// With a 7h-1m lookback window, files 2 and 3 are found. With 1m to the next query, the
// window advances to find the one newer file.
fetch_test(
Arc::clone(&catalog),
seven_hours,
Some(one_minute),
one_minute,
&[2, 3],
&[2, 3],
&[1],
)
.await;
// With a 4h-1h lookback window, files 2 and 3 are found. With 1m to the next query, there's
// nothing new in the next window.
fetch_test(
Arc::clone(&catalog),
four_hours,
Some(one_hour),
one_minute,
&[2, 3],
&[2],
&[],
)
.await;
// With a 4h-1m lookback window, files 2 and 3 are found. With 4h to the next query, the
// remaining files are found.
fetch_test(
Arc::clone(&catalog),
four_hours,
Some(one_minute),
four_hours,
&[2, 3],
&[2],
&[1, 4],
)
.await;
fetch_test(Arc::clone(&catalog), one_hour, Some(one_minute), &[], &[]).await;
// With a 1h-1m lookback window, nothing is found. In the second query 1m later, it finds
// the file created 'now'.
fetch_test(
Arc::clone(&catalog),
one_hour,
Some(one_minute),
one_minute,
&[],
&[1],
)
.await;
}
}


@ -7,7 +7,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { path = "../backoff" }
compactor = { path = "../compactor" }
compactor_scheduler = { path = "../compactor_scheduler" }


@ -18,14 +18,14 @@ ordered-float = "3"
schema = { path = "../schema" }
sha2 = "0.10"
sqlx = { version = "0.6", features = ["runtime-tokio-rustls", "postgres", "uuid"] }
thiserror = "1.0.40"
thiserror = "1.0.41"
uuid = { version = "1", features = ["v4"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
percent-encoding = "2.2.0"
[dev-dependencies] # In alphabetical order
assert_matches = "1"
paste = "1.0.12"
paste = "1.0.13"
proptest = { version = "1.2.0", default-features = false }
test_helpers = { path = "../test_helpers" }
hex = "0.4.2"


@ -160,33 +160,32 @@ impl std::fmt::Display for TableId {
}
}
/// A sequence number from a `router::Shard` (kafka partition)
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)]
#[sqlx(transparent)]
pub struct SequenceNumber(i64);
/// A sequence number from an ingester
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SequenceNumber(u64);
#[allow(missing_docs)]
impl SequenceNumber {
pub fn new(v: i64) -> Self {
pub fn new(v: u64) -> Self {
Self(v)
}
pub fn get(&self) -> i64 {
pub fn get(&self) -> u64 {
self.0
}
}
impl Add<i64> for SequenceNumber {
impl Add<u64> for SequenceNumber {
type Output = Self;
fn add(self, other: i64) -> Self {
fn add(self, other: u64) -> Self {
Self(self.0 + other)
}
}
impl Sub<i64> for SequenceNumber {
impl Sub<u64> for SequenceNumber {
type Output = Self;
fn sub(self, other: i64) -> Self {
fn sub(self, other: u64) -> Self {
Self(self.0 - other)
}
}
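A standalone sketch (mirroring the impls above; not part of the diff) of what the i64-to-u64 switch means at call sites: subtracting past zero now panics in debug builds instead of silently producing a negative sequence number.

    use std::ops::{Add, Sub};

    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct SequenceNumber(u64);

    impl SequenceNumber {
        pub fn new(v: u64) -> Self { Self(v) }
        pub fn get(&self) -> u64 { self.0 }
    }

    impl Add<u64> for SequenceNumber {
        type Output = Self;
        fn add(self, other: u64) -> Self { Self(self.0 + other) }
    }

    impl Sub<u64> for SequenceNumber {
        type Output = Self;
        fn sub(self, other: u64) -> Self { Self(self.0 - other) }
    }

    fn main() {
        assert_eq!((SequenceNumber::new(41) + 1).get(), 42);
        // SequenceNumber::new(0) - 1 panics on debug-build underflow here,
        // where the old i64-backed type would have quietly yielded -1.
    }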
@ -614,7 +613,13 @@ impl ParquetFile {
/// Estimate the memory consumption of this object and its contents
pub fn size(&self) -> usize {
std::mem::size_of_val(self) + self.column_set.size()
std::mem::size_of_val(self)
+ self
.partition_hash_id
.as_ref()
.map(|id| id.size() - std::mem::size_of_val(id))
.unwrap_or_default()
+ self.column_set.size()
- std::mem::size_of_val(&self.column_set)
}
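A comment-only breakdown (not part of the diff) of the accounting above; subtracting size_of_val for each nested field avoids double-counting inline bytes that size_of_val(self) already covers:

    //   size_of_val(self)                              inline bytes of the struct,
    //                                                  including the inline parts of
    //                                                  partition_hash_id and column_set
    // + partition_hash_id.size() - size_of_val(id)     heap bytes of the hash id only
    // + column_set.size() - size_of_val(&column_set)   heap bytes of the column set only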


@ -19,6 +19,18 @@ pub enum TransitionPartitionId {
Deterministic(PartitionHashId),
}
impl TransitionPartitionId {
/// Size in bytes including `self`.
pub fn size(&self) -> usize {
match self {
Self::Deprecated(_) => std::mem::size_of::<Self>(),
Self::Deterministic(id) => {
std::mem::size_of::<Self>() + id.size() - std::mem::size_of_val(id)
}
}
}
}
impl std::fmt::Display for TransitionPartitionId {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
@ -216,6 +228,11 @@ impl PartitionHashId {
pub fn as_bytes(&self) -> &[u8] {
self.0.as_ref()
}
/// Size in bytes including `Self`.
pub fn size(&self) -> usize {
std::mem::size_of::<Self>() + self.0.len()
}
}
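Worked through (heap size H is illustrative, not from the source): for a Deterministic variant whose inner buffer holds H heap bytes, PartitionHashId::size() = size_of::<PartitionHashId>() + H, so TransitionPartitionId::size() = size_of::<TransitionPartitionId>() + H; the id.size() - size_of_val(id) term strips the inner id's inline bytes so they are not counted on top of the enum's own footprint.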
impl<'q> sqlx::encode::Encode<'q, sqlx::Postgres> for &'q PartitionHashId {


@ -207,18 +207,18 @@ mod tests {
#[test]
fn test_intersect() {
let a = [0, i64::MAX, 40, 41, 42, 43, 44, 45]
let a = [0, u64::MAX, 40, 41, 42, 43, 44, 45]
.into_iter()
.map(SequenceNumber::new)
.collect::<SequenceNumberSet>();
let b = [1, 5, i64::MAX, 42]
let b = [1, 5, u64::MAX, 42]
.into_iter()
.map(SequenceNumber::new)
.collect::<SequenceNumberSet>();
let intersection = intersect(&a, &b);
let want = [i64::MAX, 42]
let want = [u64::MAX, 42]
.into_iter()
.map(SequenceNumber::new)
.collect::<SequenceNumberSet>();
@ -226,21 +226,17 @@ mod tests {
assert_eq!(intersection, want);
}
/// Yield vec's of [`SequenceNumber`] derived from u64 values and cast to
/// i64.
/// Yield vec's of [`SequenceNumber`] derived from u64 values.
///
/// This matches how the ingester allocates [`SequenceNumber`] - from a u64
/// source.
fn sequence_number_vec() -> impl Strategy<Value = Vec<SequenceNumber>> {
prop::collection::vec(0..u64::MAX, 0..1024).prop_map(|vec| {
vec.into_iter()
.map(|v| SequenceNumber::new(v as i64))
.collect()
})
prop::collection::vec(0..u64::MAX, 0..1024)
.prop_map(|vec| vec.into_iter().map(SequenceNumber::new).collect())
}
// The following tests compare to an order-independent HashSet, as the
// SequenceNumber uses the PartialOrd impl of the inner i64 for ordering,
// SequenceNumber uses the PartialOrd impl of the inner u64 for ordering,
// resulting in incorrect output when compared to an ordered set of cast as
// u64.
//


@ -16,8 +16,11 @@
)]
use metric::Registry;
#[cfg(tokio_unstable)]
use tokio_metrics_bridge::setup_tokio_metrics;
// Workaround for "unused crate" lint false positives.
#[cfg(not(tokio_unstable))]
use tokio_metrics_bridge as _;
use workspace_hack as _;
use once_cell::sync::Lazy;
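The cfg gates above (and in the hunk below) only compile in when the tokio_unstable cfg is set, which the .cargo/config.toml hunk at the top of this diff supplies via rustflags; building outside that config requires passing the flag through Cargo's standard mechanism, e.g.:

    RUSTFLAGS="--cfg tokio_unstable" cargo build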
@ -242,7 +245,10 @@ impl DedicatedExecutor {
.build()
.expect("Creating tokio runtime");
#[cfg(tokio_unstable)]
setup_tokio_metrics(runtime.metrics(), thread_name, metric_registry);
#[cfg(not(tokio_unstable))]
let _ = metric_registry;
runtime.block_on(async move {
// Dropping the tokio runtime only waits for tasks to yield, not to complete


@ -9,8 +9,8 @@ license.workspace = true
futures-util = { version = "0.3" }
influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format"] }
observability_deps = { path = "../observability_deps" }
serde_json = "1.0.99"
thiserror = "1.0.40"
serde_json = "1.0.100"
thiserror = "1.0.41"
tokio = { version = "1.29" }
tokio-util = { version = "0.7.8" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -10,7 +10,7 @@ bytes = "1.4"
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.99"
serde_json = "1.0.100"
snafu = "0.7"
url = "2.4.0"
uuid = { version = "1", features = ["v4"] }


@ -18,4 +18,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
test_helpers = { path = "../test_helpers" }
assert_matches = "1"
insta = { version = "1.30.0", features = ["yaml"] }
paste = "1.0.12"
paste = "1.0.13"


@ -52,7 +52,7 @@ backtrace = "0.3"
bytes = "1.4"
clap = { version = "4", features = ["derive", "env"] }
comfy-table = { version = "7.0", default-features = false }
console-subscriber = { version = "0.1.9", optional = true, features = ["parking_lot"] }
console-subscriber = { version = "0.1.10", optional = true, features = ["parking_lot"] }
dotenvy = "0.15.7"
futures = "0.3"
futures-util = { version = "0.3" }
@ -67,10 +67,10 @@ libc = { version = "0.2" }
num_cpus = "1.16.0"
once_cell = { version = "1.18", features = ["parking_lot"] }
rustyline = { version = "12.0", default-features = false, features = ["with-file-history"]}
serde_json = "1.0.99"
serde_json = "1.0.100"
snafu = "0.7"
tempfile = "3.6.0"
thiserror = "1.0.40"
thiserror = "1.0.41"
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tokio = { version = "1.29", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time", "io-std"] }
tokio-stream = { version = "0.1", features = ["net"] }
@ -93,7 +93,7 @@ predicate = { path = "../predicate" }
predicates = "3.0.3"
pretty_assertions = "1.3.0"
proptest = { version = "1.2.0", default-features = false }
serde = "1.0.164"
serde = "1.0.166"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
test_helpers_end_to_end = { path = "../test_helpers_end_to_end" }
insta = { version = "1", features = ["yaml"] }


@ -3,7 +3,8 @@ use std::sync::Arc;
use iox_time::{SystemProvider, Time, TimeProvider};
use metric::U64Gauge;
use once_cell::sync::Lazy;
use tokio::runtime::Handle;
#[cfg(tokio_unstable)]
use tokio_metrics_bridge::setup_tokio_metrics;
/// Package version.
@ -54,7 +55,12 @@ pub fn setup_metric_registry() -> Arc<metric::Registry> {
registry.register_instrument("jemalloc_metrics", crate::jemalloc::JemallocMetrics::new);
// Register tokio metrics for main runtime
setup_tokio_metrics(Handle::current().metrics(), "main", Arc::clone(&registry));
#[cfg(tokio_unstable)]
setup_tokio_metrics(
tokio::runtime::Handle::current().metrics(),
"main",
Arc::clone(&registry),
);
registry
}


@ -1323,10 +1323,15 @@ async fn assert_ingester_contains_results(
.await
.unwrap();
let ingester_uuid = ingester_response.app_metadata.ingester_uuid;
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("at least one ingester partition");
let ingester_uuid = ingester_partition.app_metadata.ingester_uuid;
assert!(!ingester_uuid.is_empty());
assert_batches_sorted_eq!(expected, &ingester_response.record_batches);
assert_batches_sorted_eq!(expected, &ingester_partition.record_batches);
}
#[tokio::test]


@ -1,8 +1,14 @@
use arrow::datatypes::DataType;
use arrow_flight::{error::FlightError, Ticket};
use arrow_util::assert_batches_sorted_eq;
use data_types::{NamespaceId, TableId};
use datafusion::{
prelude::{col, lit},
scalar::ScalarValue,
};
use futures::FutureExt;
use http::StatusCode;
use influxdb_iox_client::table::generated_types::{Part, PartitionTemplate, TemplatePart};
use ingester_query_grpc::{influxdata::iox::ingester::v1 as proto, IngesterQueryRequest};
use prost::Message;
use test_helpers_end_to_end::{maybe_skip_integration, MiniCluster, Step, StepTest, StepTestState};
@ -39,7 +45,14 @@ async fn persist_on_demand() {
.await
.unwrap();
let ingester_uuid = ingester_response.app_metadata.ingester_uuid;
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
let ingester_uuid = ingester_partition.app_metadata.ingester_uuid;
assert!(!ingester_uuid.is_empty());
let expected = [
@ -49,7 +62,7 @@ async fn persist_on_demand() {
"| A | B | 1970-01-01T00:00:00.000123456Z | 42 |",
"+------+------+--------------------------------+-----+",
];
assert_batches_sorted_eq!(&expected, &ingester_response.record_batches);
assert_batches_sorted_eq!(&expected, &ingester_partition.record_batches);
}
.boxed()
})),
@ -77,8 +90,15 @@ async fn persist_on_demand() {
.await
.unwrap();
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
let num_files_persisted =
ingester_response.app_metadata.completed_persistence_count;
ingester_partition.app_metadata.completed_persistence_count;
assert_eq!(num_files_persisted, 1);
}
.boxed()
@ -121,11 +141,17 @@ async fn ingester_flight_api() {
.query_ingester(query.clone(), cluster.ingester().ingester_grpc_connection())
.await
.unwrap();
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
let ingester_uuid = ingester_response.app_metadata.ingester_uuid.clone();
let ingester_uuid = ingester_partition.app_metadata.ingester_uuid.clone();
assert!(!ingester_uuid.is_empty());
let schema = ingester_response.schema.unwrap();
let schema = ingester_partition.schema.unwrap();
let expected = [
"+------+------+--------------------------------+-----+",
@ -135,11 +161,11 @@ async fn ingester_flight_api() {
"| B | A | 1970-01-01T00:00:00.001234567Z | 84 |",
"+------+------+--------------------------------+-----+",
];
assert_batches_sorted_eq!(&expected, &ingester_response.record_batches);
assert_batches_sorted_eq!(&expected, &ingester_partition.record_batches);
// Also ensure that the schema of the batches matches what is
// reported by the performed_query.
ingester_response
ingester_partition
.record_batches
.iter()
.enumerate()
@ -152,7 +178,13 @@ async fn ingester_flight_api() {
.query_ingester(query.clone(), cluster.ingester().ingester_grpc_connection())
.await
.unwrap();
assert_eq!(ingester_response.app_metadata.ingester_uuid, ingester_uuid);
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
assert_eq!(ingester_partition.app_metadata.ingester_uuid, ingester_uuid);
// Restart the ingesters
cluster.restart_ingesters().await;
@ -167,7 +199,146 @@ async fn ingester_flight_api() {
.query_ingester(query, cluster.ingester().ingester_grpc_connection())
.await
.unwrap();
assert_ne!(ingester_response.app_metadata.ingester_uuid, ingester_uuid);
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
assert_ne!(ingester_partition.app_metadata.ingester_uuid, ingester_uuid);
}
#[tokio::test]
async fn ingester_partition_pruning() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
// Set up cluster
let mut cluster = MiniCluster::create_shared_never_persist(database_url).await;
let mut steps: Vec<_> = vec![Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let namespace_name = state.cluster().namespace();
let mut namespace_client = influxdb_iox_client::namespace::Client::new(
state.cluster().router().router_grpc_connection(),
);
namespace_client
.create_namespace(
namespace_name,
None,
None,
Some(PartitionTemplate {
parts: vec![
TemplatePart {
part: Some(Part::TagValue("tag1".into())),
},
TemplatePart {
part: Some(Part::TagValue("tag3".into())),
},
],
}),
)
.await
.unwrap();
let mut table_client = influxdb_iox_client::table::Client::new(
state.cluster().router().router_grpc_connection(),
);
// table1: created implicitly by writing to it
// table2: do not override partition template => use namespace template
table_client
.create_table(namespace_name, "table2", None)
.await
.unwrap();
// table3: override the namespace template
table_client
.create_table(
namespace_name,
"table3",
Some(PartitionTemplate {
parts: vec![TemplatePart {
part: Some(Part::TagValue("tag2".into())),
}],
}),
)
.await
.unwrap();
}
.boxed()
}))]
.into_iter()
.chain((1..=3).flat_map(|tid| {
[Step::WriteLineProtocol(
[
format!("table{tid},tag1=v1a,tag2=v2a,tag3=v3a f=1 11"),
format!("table{tid},tag1=v1b,tag2=v2a,tag3=v3a f=1 11"),
format!("table{tid},tag1=v1a,tag2=v2b,tag3=v3a f=1 11"),
format!("table{tid},tag1=v1b,tag2=v2b,tag3=v3a f=1 11"),
format!("table{tid},tag1=v1a,tag2=v2a,tag3=v3b f=1 11"),
format!("table{tid},tag1=v1b,tag2=v2a,tag3=v3b f=1 11"),
format!("table{tid},tag1=v1a,tag2=v2b,tag3=v3b f=1 11"),
format!("table{tid},tag1=v1b,tag2=v2b,tag3=v3b f=1 11"),
]
.join("\n"),
)]
.into_iter()
}))
.collect();
steps.push(Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
// Note: The querier will perform correct type coercion. We must simulate this here, otherwise the ingester
// will NOT be able to prune the data because the predicate evaluation will fail with a type error
// and the predicate will be ignored.
let predicate = ::predicate::Predicate::new().with_expr(col("tag1").eq(lit(
ScalarValue::Dictionary(
Box::new(DataType::Int32),
Box::new(ScalarValue::from("v1a")),
),
)));
let query = IngesterQueryRequest::new(
state.cluster().namespace_id().await,
state.cluster().table_id("table1").await,
vec![],
Some(predicate),
);
let query: proto::IngesterQueryRequest = query.try_into().unwrap();
let ingester_response = state
.cluster()
.query_ingester(
query.clone(),
state.cluster().ingester().ingester_grpc_connection(),
)
.await
.unwrap();
let expected = [
"+-----+------+------+------+--------------------------------+",
"| f | tag1 | tag2 | tag3 | time |",
"+-----+------+------+------+--------------------------------+",
"| 1.0 | v1a | v2a | v3a | 1970-01-01T00:00:00.000000011Z |",
"| 1.0 | v1a | v2a | v3b | 1970-01-01T00:00:00.000000011Z |",
"| 1.0 | v1a | v2b | v3a | 1970-01-01T00:00:00.000000011Z |",
"| 1.0 | v1a | v2b | v3b | 1970-01-01T00:00:00.000000011Z |",
"+-----+------+------+------+--------------------------------+",
];
let record_batches = ingester_response
.partitions
.into_iter()
.flat_map(|p| p.record_batches)
.collect::<Vec<_>>();
assert_batches_sorted_eq!(&expected, &record_batches);
}
.boxed()
})));
StepTest::new(&mut cluster, steps).run().await
}
#[tokio::test]


@ -299,6 +299,48 @@ async fn query_after_persist_sees_new_files() {
StepTest::new(&mut cluster, steps).run().await
}
#[tokio::test]
async fn query_after_shutdown_sees_new_files() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
// Configure a cluster such that the ingester never persists (until
// shutdown)
let ingester_config = TestConfig::new_ingester_never_persist(&database_url);
let router_config = TestConfig::new_router(&ingester_config);
// Querier configured to quickly consider ingesters dead to speed up the
// test.
let querier_config =
TestConfig::new_querier(&ingester_config).with_querier_circuit_breaker_threshold(1);
let mut cluster = MiniCluster::new()
.with_ingester(ingester_config)
.await
.with_router(router_config)
.await
.with_querier(querier_config)
.await;
let steps = vec![
Step::WriteLineProtocol("bananas,tag1=A,tag2=B val=42i 123456".to_string()),
Step::AssertNumParquetFiles { expected: 0 }, // test invariant
Step::GracefulStopIngesters,
Step::AssertNumParquetFiles { expected: 1 },
Step::Query {
sql: "select * from bananas".to_string(),
expected: vec![
"+------+------+--------------------------------+-----+",
"| tag1 | tag2 | time | val |",
"+------+------+--------------------------------+-----+",
"| A | B | 1970-01-01T00:00:00.000123456Z | 42 |",
"+------+------+--------------------------------+-----+",
],
},
];
StepTest::new(&mut cluster, steps).run().await
}
#[tokio::test]
async fn table_not_found_on_ingester() {
test_helpers::maybe_start_logging();


@ -193,7 +193,14 @@ async fn write_replication() {
.await
.unwrap();
let ingester_uuid = ingester_response.app_metadata.ingester_uuid;
assert_eq!(ingester_response.partitions.len(), 1);
let ingester_partition = ingester_response
.partitions
.into_iter()
.next()
.expect("just checked len");
let ingester_uuid = ingester_partition.app_metadata.ingester_uuid;
assert!(!ingester_uuid.is_empty());
let expected = [
@ -212,7 +219,7 @@ async fn write_replication() {
"| A | B | 1970-01-01T00:00:00.000000020Z | 20 |",
"+------+------+--------------------------------+-----+",
];
assert_batches_sorted_eq!(&expected, &ingester_response.record_batches);
assert_batches_sorted_eq!(&expected, &ingester_partition.record_batches);
}
.boxed()
})));


@ -24,10 +24,10 @@ prost = "0.11"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
schema = { path = "../schema" }
serde_json = "1.0.99"
serde_json = "1.0.100"
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread"] }
tokio-stream = "0.1.13"
thiserror = "1.0.40"
thiserror = "1.0.41"
tonic = { workspace = true }
[dev-dependencies]


@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies] # In alphabetical order
integer-encoding = "3.0.4"
integer-encoding = "4.0.0"
snafu = "0.7"
snap = "1.1.0"
observability_deps = { path = "../observability_deps" }


@ -10,7 +10,7 @@ arrow = { workspace = true, features = ["prettyprint"] }
arrow_util = { version = "0.1.0", path = "../arrow_util" }
arrow-flight = { workspace = true }
async-channel = "1.8.0"
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { version = "0.1.0", path = "../backoff" }
bytes = "1.4.0"
crossbeam-utils = "0.8.16"
@ -31,7 +31,7 @@ observability_deps = { version = "0.1.0", path = "../observability_deps" }
once_cell = "1.18"
parking_lot = "0.12.1"
parquet_file = { version = "0.1.0", path = "../parquet_file" }
pin-project = "1.1.1"
pin-project = "1.1.2"
predicate = { version = "0.1.0", path = "../predicate" }
prost = { version = "0.11.9", default-features = false, features = ["std"] }
rand = "0.8.5"
@ -39,7 +39,7 @@ schema = { version = "0.1.0", path = "../schema" }
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
sharder = { version = "0.1.0", path = "../sharder" }
test_helpers = { path = "../test_helpers", features = ["future_timeout"], optional = true }
thiserror = "1.0.40"
thiserror = "1.0.41"
tracker = { path = "../tracker" }
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tokio-util = "0.7.8"
@ -58,7 +58,7 @@ ingester_test_ctx = { path = "../ingester_test_ctx" }
lazy_static = "1.4.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
object_store = { workspace = true }
paste = "1.0.12"
paste = "1.0.13"
tempfile = "3.6.0"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
tokio = { version = "1.29", features = ["macros", "time", "test-util"] }
@ -81,3 +81,7 @@ name = "write"
harness = false
# Require some internal types be made visible for benchmark code.
required-features = ["benches"]
[[bench]]
name = "query"
harness = false

ingester/benches/query.rs (new file)

@ -0,0 +1,102 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use data_types::{NamespaceId, PartitionKey, TableId};
use ingester::IngesterRpcInterface;
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
use ingester_test_ctx::{TestContext, TestContextBuilder};
use std::fmt::Write;
const TEST_NAMESPACE: &str = "bananas";
const PARTITION_KEY: &str = "platanos";
fn generate_table_data(rows: usize, cols: usize) -> String {
let mut buf = String::new();
for i in 0..rows {
write!(&mut buf, "bananas ").unwrap();
for j in 0..(cols - 1) {
write!(&mut buf, "v{j}={i}{j},").unwrap();
}
writeln!(&mut buf, "v{cols}={i}{cols} 42{i}").unwrap();
}
buf
}
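Worked example by inspection of the loops above (not in the source): generate_table_data(1, 3) produces the single line `bananas v0=00,v1=01,v3=03 420`. Fields v0 through v{cols-2} come from the inner loop, after which the final field is written as v{cols}, so the index v{cols-1} is skipped.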
/// Return an initialised and pre-warmed ingester instance backed by a catalog
/// correctly populated to accept writes of `lp`.
async fn init(
lp: impl AsRef<str>,
) -> (TestContext<impl IngesterRpcInterface>, NamespaceId, TableId) {
let lp = lp.as_ref();
let mut ctx = TestContextBuilder::default()
// Don't stop ingest during benchmarks
.with_max_persist_queue_depth(10_000_000)
.with_persist_hot_partition_cost(10_000_000_000)
.build()
.await;
// Ensure the namespace exists in the catalog.
let ns = ctx.ensure_namespace(TEST_NAMESPACE, None).await;
// Write the test data
ctx.write_lp(TEST_NAMESPACE, lp, PartitionKey::from(PARTITION_KEY), 42)
.await;
let table_id = ctx.table_id(TEST_NAMESPACE, "bananas").await;
(ctx, ns.id, table_id)
}
fn bench_query(c: &mut Criterion) {
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.expect("failed to initialise tokio runtime for benchmark");
for (rows, cols) in [(100_000, 10), (100_000, 100), (100_000, 200)] {
run_bench("no projection", rows, cols, vec![], &runtime, c);
run_bench(
"project 1 column",
rows,
cols,
vec!["time".to_string()],
&runtime,
c,
);
}
}
fn run_bench(
name: &str,
rows: usize,
cols: usize,
projection: Vec<String>,
runtime: &tokio::runtime::Runtime,
c: &mut Criterion,
) {
let lp = generate_table_data(rows, cols);
let (ctx, namespace_id, table_id) = runtime.block_on(init(lp));
let mut group = c.benchmark_group("query");
group.throughput(Throughput::Elements(1)); // Queries per second
group.bench_function(
BenchmarkId::new(name, format!("rows_{rows}_cols{cols}")),
|b| {
let ctx = &ctx;
let projection = &projection;
b.to_async(runtime).iter(|| async move {
ctx.query(IngesterQueryRequest {
namespace_id: namespace_id.get(),
table_id: table_id.get(),
columns: projection.clone(),
predicate: None,
})
.await
.expect("query request failed");
});
},
);
}
criterion_group!(benches, bench_query);
criterion_main!(benches);
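With the [[bench]] entry added to ingester/Cargo.toml above (name = "query", harness = false), this benchmark would typically be invoked through Cargo's standard selector, e.g. cargo bench --bench query.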


@ -7,19 +7,23 @@ use std::sync::Arc;
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use metric::U64Counter;
use predicate::Predicate;
use trace::span::Span;
use super::{
partition::resolver::PartitionProvider,
post_write::PostWriteObserver,
table::{name_resolver::TableNameProvider, TableData},
table::{metadata_resolver::TableProvider, TableData},
};
use crate::{
arcmap::ArcMap,
deferred_load::DeferredLoad,
dml_payload::IngestOp,
dml_sink::DmlSink,
query::{response::QueryResponse, tracing::QueryExecTracing, QueryError, QueryExec},
query::{
projection::OwnedProjection, response::QueryResponse, tracing::QueryExecTracing,
QueryError, QueryExec,
},
};
/// The string name / identifier of a Namespace.
@ -60,12 +64,13 @@ pub(crate) struct NamespaceData<O> {
/// A set of tables this [`NamespaceData`] instance has processed
/// [`IngestOp`]'s for.
///
/// The [`TableNameProvider`] acts as a [`DeferredLoad`] constructor to
/// resolve the [`TableName`] for new [`TableData`] out of the hot path.
/// The [`TableProvider`] acts as a [`DeferredLoad`] constructor to
/// resolve the catalog [`Table`] for new [`TableData`] out of the hot path.
///
/// [`TableName`]: crate::buffer_tree::table::TableName
///
/// [`Table`]: data_types::Table
tables: ArcMap<TableId, TableData<O>>,
table_name_resolver: Arc<dyn TableNameProvider>,
catalog_table_resolver: Arc<dyn TableProvider>,
/// The count of tables initialised in this Ingester so far, across all
/// namespaces.
table_count: U64Counter,
@ -83,7 +88,7 @@ impl<O> NamespaceData<O> {
pub(super) fn new(
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_name_resolver: Arc<dyn TableNameProvider>,
catalog_table_resolver: Arc<dyn TableProvider>,
partition_provider: Arc<dyn PartitionProvider>,
post_write_observer: Arc<O>,
metrics: &metric::Registry,
@ -99,7 +104,7 @@ impl<O> NamespaceData<O> {
namespace_id,
namespace_name,
tables: Default::default(),
table_name_resolver,
catalog_table_resolver,
table_count,
partition_provider,
post_write_observer,
@ -151,7 +156,7 @@ where
self.table_count.inc(1);
Arc::new(TableData::new(
table_id,
Arc::new(self.table_name_resolver.for_table(table_id)),
Arc::new(self.catalog_table_resolver.for_table(table_id)),
self.namespace_id,
Arc::clone(&self.namespace_name),
Arc::clone(&self.partition_provider),
@ -187,8 +192,9 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
assert_eq!(
self.namespace_id, namespace_id,
@ -204,7 +210,7 @@ where
// a tracing delegate to emit a child span.
Ok(QueryResponse::new(
QueryExecTracing::new(inner, "table")
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, projection, span, predicate)
.await?,
))
}
@ -226,7 +232,7 @@ mod tests {
test_util::{
defer_namespace_name_1_ms, make_write_op, PartitionDataBuilder, ARBITRARY_NAMESPACE_ID,
ARBITRARY_NAMESPACE_NAME, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_ID,
ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_NAME_PROVIDER,
ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_PROVIDER,
},
};
@ -243,7 +249,7 @@ mod tests {
let ns = NamespaceData::new(
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_ms(),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
&metrics,

View File

@ -14,8 +14,10 @@ use self::{
buffer::{traits::Queryable, BufferState, DataBuffer, Persisting},
persisting::{BatchIdent, PersistingData},
};
use super::{namespace::NamespaceName, table::TableName};
use crate::{deferred_load::DeferredLoad, query_adaptor::QueryAdaptor};
use super::{namespace::NamespaceName, table::TableMetadata};
use crate::{
deferred_load::DeferredLoad, query::projection::OwnedProjection, query_adaptor::QueryAdaptor,
};
mod buffer;
pub(crate) mod persisting;
@ -73,9 +75,9 @@ pub struct PartitionData {
/// The catalog ID for the table this partition is part of.
table_id: TableId,
/// The name of the table this partition is part of, potentially unresolved
/// The catalog metadata for the table this partition is part of, potentially unresolved
/// / deferred.
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
/// A [`DataBuffer`] for incoming writes.
buffer: DataBuffer,
@ -108,7 +110,7 @@ impl PartitionData {
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
sort_key: SortKeyState,
) -> Self {
Self {
@ -119,7 +121,7 @@ impl PartitionData {
namespace_id,
namespace_name,
table_id,
table_name,
table,
buffer: DataBuffer::default(),
persisting: VecDeque::with_capacity(1),
started_persistence_count: BatchIdent::default(),
@ -139,7 +141,7 @@ impl PartitionData {
trace!(
namespace_id = %self.namespace_id,
table_id = %self.table_id,
table_name = %self.table_name,
table = %self.table,
partition_id = %self.partition_id,
partition_key = %self.partition_key,
"buffered write"
@ -156,9 +158,9 @@ impl PartitionData {
/// Return all data for this partition, ordered by the calls to
/// [`PartitionData::buffer_write()`].
pub(crate) fn get_query_data(&mut self) -> Option<QueryAdaptor> {
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Option<QueryAdaptor> {
// Extract the buffered data, if any.
let buffered_data = self.buffer.get_query_data();
let buffered_data = self.buffer.get_query_data(projection);
// Prepend any currently persisting batches.
//
@ -168,14 +170,14 @@ impl PartitionData {
let data = self
.persisting
.iter()
.flat_map(|(_, b)| b.get_query_data())
.flat_map(|(_, b)| b.get_query_data(projection))
.chain(buffered_data)
.collect::<Vec<_>>();
trace!(
namespace_id = %self.namespace_id,
table_id = %self.table_id,
table_name = %self.table_name,
table = %self.table,
partition_id = %self.partition_id,
partition_key = %self.partition_key,
n_batches = data.len(),
@ -221,7 +223,7 @@ impl PartitionData {
debug!(
namespace_id = %self.namespace_id,
table_id = %self.table_id,
table_name = %self.table_name,
table = %self.table,
partition_id = %self.partition_id,
partition_key = %self.partition_key,
%batch_ident,
@ -230,7 +232,10 @@ impl PartitionData {
// Wrap the persisting data in the type wrapper
let data = PersistingData::new(
QueryAdaptor::new(self.partition_id, fsm.get_query_data()),
QueryAdaptor::new(
self.partition_id,
fsm.get_query_data(&OwnedProjection::default()),
),
batch_ident,
);
@ -271,7 +276,7 @@ impl PartitionData {
persistence_count = %self.completed_persistence_count,
namespace_id = %self.namespace_id,
table_id = %self.table_id,
table_name = %self.table_name,
table = %self.table,
partition_id = %self.partition_id,
partition_key = %self.partition_key,
batch_ident = %batch.batch_ident(),
@ -302,10 +307,10 @@ impl PartitionData {
self.completed_persistence_count
}
/// Return the name of the table this [`PartitionData`] is buffering writes
/// Return the metadata of the table this [`PartitionData`] is buffering writes
/// for.
pub(crate) fn table_name(&self) -> &Arc<DeferredLoad<TableName>> {
&self.table_name
pub(crate) fn table(&self) -> &Arc<DeferredLoad<TableMetadata>> {
&self.table
}
/// Return the table ID for this partition.
@ -349,7 +354,7 @@ impl PartitionData {
#[cfg(test)]
mod tests {
use std::{ops::Deref, time::Duration};
use std::time::Duration;
use arrow::compute::SortOptions;
use arrow_util::assert_batches_eq;
@ -378,7 +383,7 @@ mod tests {
let mut p = PartitionDataBuilder::new().build();
// And no data should be returned when queried.
assert!(p.get_query_data().is_none());
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
// Perform a single write.
let mb = lp_to_mutable_batch(r#"bananas,city=London people=2,pigeons="millions" 10"#).1;
@ -387,7 +392,9 @@ mod tests {
// The data should be readable.
{
let data = p.get_query_data().expect("should return data");
let data = p
.get_query_data(&OwnedProjection::default())
.expect("should return data");
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
let expected = [
@ -397,15 +404,7 @@ mod tests {
"| London | 2.0 | millions | 1970-01-01T00:00:00.000000010Z |",
"+--------+--------+----------+--------------------------------+",
];
assert_batches_eq!(
expected,
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
);
assert_batches_eq!(expected, data.record_batches());
}
// Perform another write, adding data to the existing queryable data
@ -416,7 +415,9 @@ mod tests {
// And finally both writes should be readable.
{
let data = p.get_query_data().expect("should contain data");
let data = p
.get_query_data(&OwnedProjection::default())
.expect("should contain data");
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
let expected = [
@ -427,15 +428,7 @@ mod tests {
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
"+--------+--------+----------+--------------------------------+",
];
assert_batches_eq!(
expected,
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
);
assert_batches_eq!(expected, data.record_batches());
}
}
@ -445,7 +438,7 @@ mod tests {
async fn test_persist() {
let mut p = PartitionDataBuilder::new().build();
assert!(p.get_query_data().is_none());
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
// Perform a single write.
let mb = lp_to_mutable_batch(r#"bananas,city=London people=2,pigeons="millions" 10"#).1;
@ -468,15 +461,7 @@ mod tests {
"| London | 2.0 | millions | 1970-01-01T00:00:00.000000010Z |",
"+--------+--------+----------+--------------------------------+",
];
assert_batches_eq!(
expected,
&*persisting_data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
);
assert_batches_eq!(expected, persisting_data.record_batches());
// Ensure the started batch ident is increased after a persist call, but not the completed
// batch ident.
@ -492,7 +477,9 @@ mod tests {
// Which must be readable, alongside the ongoing persist data.
{
let data = p.get_query_data().expect("must have data");
let data = p
.get_query_data(&OwnedProjection::default())
.expect("must have data");
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
assert_eq!(data.record_batches().len(), 2);
let expected = [
@ -503,15 +490,7 @@ mod tests {
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
"+--------+--------+----------+--------------------------------+",
];
assert_batches_eq!(
expected,
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
);
assert_batches_eq!(expected, data.record_batches());
}
// The persist now "completes".
@ -526,7 +505,9 @@ mod tests {
// Querying the buffer should now return only the second write.
{
let data = p.get_query_data().expect("must have data");
let data = p
.get_query_data(&OwnedProjection::default())
.expect("must have data");
assert_eq!(data.partition_id(), ARBITRARY_PARTITION_ID);
assert_eq!(data.record_batches().len(), 1);
let expected = [
@ -536,15 +517,7 @@ mod tests {
"| Madrid | 4.0 | none | 1970-01-01T00:00:00.000000020Z |",
"+--------+--------+---------+--------------------------------+",
];
assert_batches_eq!(
expected,
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
);
assert_batches_eq!(expected, data.record_batches());
}
}
@ -557,12 +530,7 @@ mod tests {
// A helper function to dedupe the record batches in [`QueryAdaptor`]
// and assert the resulting batch contents.
async fn assert_deduped(expect: &[&str], batch: QueryAdaptor) {
let batch = batch
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>();
let batch = batch.record_batches().to_vec();
let sort_keys = vec![PhysicalSortExpr {
expr: col("time", &batch[0].schema()).unwrap(),
@ -596,7 +564,13 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(1))
.expect("write should succeed");
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
1
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -605,7 +579,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 1.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -614,7 +588,13 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(2))
.expect("write should succeed");
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
1
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -623,7 +603,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 2.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -656,7 +636,13 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(3))
.expect("write should succeed");
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 2);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
2
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -665,7 +651,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 3.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -697,7 +683,13 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(3))
.expect("write should succeed");
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 3);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
3
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -706,7 +698,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -717,7 +709,13 @@ mod tests {
assert!(set.contains(SequenceNumber::new(2)));
// And assert the correct value remains.
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 2);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
2
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -726,7 +724,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -736,7 +734,13 @@ mod tests {
assert!(set.contains(SequenceNumber::new(3)));
// And assert the correct value remains.
assert_eq!(p.get_query_data().unwrap().record_batches().len(), 1);
assert_eq!(
p.get_query_data(&OwnedProjection::default())
.unwrap()
.record_batches()
.len(),
1
);
assert_deduped(
&[
"+--------------------------------+-----+",
@ -745,7 +749,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
p.get_query_data().unwrap(),
p.get_query_data(&OwnedProjection::default()).unwrap(),
)
.await;
@ -777,7 +781,7 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(3))
.expect("write should succeed");
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -787,12 +791,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 2.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
// Persist again, moving the last write to the persisting state and
@ -805,7 +804,7 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(4))
.expect("write should succeed");
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -816,12 +815,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 3.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
// Persist again, moving the last write to the persisting state and
@ -834,7 +828,7 @@ mod tests {
p.buffer_write(mb, SequenceNumber::new(5))
.expect("write should succeed");
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -846,12 +840,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
// Finish persisting the second batch out-of-order! The middle entry,
@ -860,7 +849,7 @@ mod tests {
assert_eq!(set.len(), 1);
assert!(set.contains(SequenceNumber::new(3)));
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -871,12 +860,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
// Finish persisting the last batch.
@ -884,7 +868,7 @@ mod tests {
assert_eq!(set.len(), 1);
assert!(set.contains(SequenceNumber::new(4)));
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -894,12 +878,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
// Finish persisting the first batch.
@ -908,7 +887,7 @@ mod tests {
assert!(set.contains(SequenceNumber::new(1)));
// Assert only the buffered data remains
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------------------------------+-----+",
@ -917,12 +896,7 @@ mod tests {
"| 1970-01-01T00:00:00.000000042Z | 4.0 |",
"+--------------------------------+-----+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
}
@ -1009,7 +983,7 @@ mod tests {
);
// Nothing should explode, data should be readable.
let data = p.get_query_data().unwrap();
let data = p.get_query_data(&OwnedProjection::default()).unwrap();
assert_batches_eq!(
[
"+--------+--------+----------+--------------------------------+",
@ -1019,12 +993,7 @@ mod tests {
"| Madrid | 2.0 | none | 1970-01-01T00:00:00.000000011Z |",
"+--------+--------+----------+--------------------------------+",
],
&*data
.record_batches()
.iter()
.map(Deref::deref)
.cloned()
.collect::<Vec<_>>()
&*data.record_batches().to_vec()
);
}
@ -1053,6 +1022,6 @@ mod tests {
async fn test_empty_partition_no_queryadaptor_panic() {
let mut p = PartitionDataBuilder::new().build();
assert!(p.get_query_data().is_none());
assert!(p.get_query_data(&OwnedProjection::default()).is_none());
}
}

View File

@ -1,5 +1,3 @@
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use data_types::SequenceNumber;
use mutable_batch::MutableBatch;
@ -11,6 +9,8 @@ pub(crate) mod traits;
pub(crate) use state_machine::*;
use crate::query::projection::OwnedProjection;
use self::{always_some::AlwaysSome, traits::Queryable};
/// The current state of the [`BufferState`] state machine.
@ -63,12 +63,12 @@ impl DataBuffer {
/// Return all data for this buffer, ordered by the [`SequenceNumber`] from
/// which it was buffered with.
pub(crate) fn get_query_data(&mut self) -> Vec<Arc<RecordBatch>> {
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Vec<RecordBatch> {
// Take ownership of the FSM and return the data within it.
self.0.mutate(|fsm| match fsm {
// The buffering state can return data.
FsmState::Buffering(b) => {
let ret = b.get_query_data();
let ret = b.get_query_data(projection);
(FsmState::Buffering(b), ret)
}
})
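
The `mutate` call above must move the FSM state out by value (state transitions consume `self`) and then put the resulting state back. A minimal sketch of that take-and-replace idiom (hypothetical stand-in; the crate's `AlwaysSome` may differ in detail):

struct AlwaysSome<T>(Option<T>);

impl<T> AlwaysSome<T> {
    fn new(value: T) -> Self {
        Self(Some(value))
    }

    /// Hand the state to `f` by value; `f` returns the next state plus a
    /// result, allowing consuming transitions behind a `&mut self` API.
    fn mutate<R>(&mut self, f: impl FnOnce(T) -> (T, R)) -> R {
        let state = self.0.take().expect("invariant: value always present");
        let (next, ret) = f(state);
        self.0 = Some(next);
        ret
    }
}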

View File

@ -1,5 +1,3 @@
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use schema::Projection;
@ -39,12 +37,12 @@ impl Buffer {
/// # Panics
///
/// If generating the snapshot fails, this method panics.
pub(super) fn snapshot(self) -> Option<Arc<RecordBatch>> {
Some(Arc::new(
pub(super) fn snapshot(self) -> Option<RecordBatch> {
Some(
self.buffer?
.to_arrow(Projection::All)
.expect("failed to snapshot buffer data"),
))
)
}
pub(super) fn is_empty(&self) -> bool {

View File

@ -1,6 +1,4 @@
#![allow(dead_code)]
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use data_types::{sequence_number_set::SequenceNumberSet, SequenceNumber};
use mutable_batch::MutableBatch;
@ -12,6 +10,8 @@ mod snapshot;
pub(in crate::buffer_tree::partition::buffer) use buffering::*;
pub(crate) use persisting::*;
use crate::query::projection::OwnedProjection;
use super::traits::{Queryable, Writeable};
/// A result type for fallible transitions.
@ -122,14 +122,14 @@ where
/// Returns the current buffer data.
///
/// This is always a cheap method call.
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
self.state.get_query_data()
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
self.state.get_query_data(projection)
}
}
#[cfg(test)]
mod tests {
use std::ops::Deref;
use std::sync::Arc;
use arrow_util::assert_batches_eq;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
@ -139,6 +139,8 @@ mod tests {
use super::*;
#[test]
// comparing dyn Array always has the same vtable, so it is accurate to use Arc::ptr_eq
#[allow(clippy::vtable_address_comparisons)]
fn test_buffer_lifecycle() {
// Initialise a buffer in the base state.
let mut buffer: BufferState<Buffering> = BufferState::new();
@ -166,7 +168,7 @@ mod tests {
// Keep the data to validate they are ref-counted copies after further
// writes below. Note this construct allows the caller to decide when/if
// to allocate.
let w1_data = buffer.get_query_data();
let w1_data = buffer.get_query_data(&OwnedProjection::default());
let expected = vec![
"+-------+----------+----------+--------------------------------+",
@ -175,7 +177,7 @@ mod tests {
"| true | 42.0 | platanos | 1991-03-10T00:00:42.000000042Z |",
"+-------+----------+----------+--------------------------------+",
];
assert_batches_eq!(&expected, &[w1_data[0].deref().clone()]);
assert_batches_eq!(&expected, &[w1_data[0].clone()]);
// Apply another write.
buffer
@ -195,7 +197,7 @@ mod tests {
};
// Verify the writes are still queryable.
let w2_data = buffer.get_query_data();
let w2_data = buffer.get_query_data(&OwnedProjection::default());
let expected = vec![
"+-------+----------+----------+--------------------------------+",
"| great | how_much | tag | time |",
@ -205,18 +207,18 @@ mod tests {
"+-------+----------+----------+--------------------------------+",
];
assert_eq!(w2_data.len(), 1);
assert_batches_eq!(&expected, &[w2_data[0].deref().clone()]);
assert_batches_eq!(&expected, &[w2_data[0].clone()]);
// Ensure the same data is returned for a second read.
{
let second_read = buffer.get_query_data();
let second_read = buffer.get_query_data(&OwnedProjection::default());
assert_eq!(w2_data, second_read);
// And that no data was actually copied.
let same_arcs = w2_data
.iter()
.zip(second_read.iter())
.all(|(a, b)| Arc::ptr_eq(a, b));
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
assert!(same_arcs);
}
@ -224,14 +226,120 @@ mod tests {
let buffer: BufferState<Persisting> = buffer.into_persisting();
// Extract the final buffered result
let final_data = buffer.get_query_data();
let final_data = buffer.get_query_data(&OwnedProjection::default());
// And once again verify no data was changed, copied or re-ordered.
assert_eq!(w2_data, final_data);
let same_arcs = w2_data
.into_iter()
.zip(final_data.into_iter())
.all(|(a, b)| Arc::ptr_eq(&a, &b));
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
assert!(same_arcs);
// Assert the sequence numbers were recorded.
let set = buffer.into_sequence_number_set();
assert!(set.contains(SequenceNumber::new(0)));
assert!(set.contains(SequenceNumber::new(1)));
assert_eq!(set.len(), 2);
}
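
The `same_arcs` checks above work because cloning a `RecordBatch` is shallow: columns are `Arc<dyn Array>`, so pointer equality on a column proves the buffers were shared rather than copied. A small standalone illustration (a sketch, not test code from this crate):

use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let column: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    let a = RecordBatch::try_new(schema, vec![column]).expect("column matches schema");
    let b = a.clone();

    // Cloning a RecordBatch clones Arcs, not data: both batches share the
    // same column allocation.
    assert!(Arc::ptr_eq(a.column(0), b.column(0)));
}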
/// Assert projection is correct across all the queryable FSM states.
#[test]
// comparing dyn Array always has the same vtable, so it is accurate to use Arc::ptr_eq
#[allow(clippy::vtable_address_comparisons)]
fn test_buffer_projection() {
let projection = OwnedProjection::from(vec![
"tag".to_string(),
"great".to_string(),
"missing".to_string(),
"time".to_string(),
]);
// Initialise a buffer in the base state.
let mut buffer: BufferState<Buffering> = BufferState::new();
// Write some data to a buffer.
buffer
.write(
lp_to_mutable_batch(
r#"bananas,tag=platanos great=true,how_much=42 668563242000000042"#,
)
.1,
SequenceNumber::new(0),
)
.expect("write to empty buffer should succeed");
// Extract the queryable data from the buffer and validate it.
//
// Keep the data to validate they are ref-counted copies after further
// writes below. Note this construct allows the caller to decide when/if
// to allocate.
let w1_data = buffer.get_query_data(&projection);
let expected = vec![
"+----------+-------+--------------------------------+",
"| tag | great | time |",
"+----------+-------+--------------------------------+",
"| platanos | true | 1991-03-10T00:00:42.000000042Z |",
"+----------+-------+--------------------------------+",
];
assert_batches_eq!(&expected, &[w1_data[0].clone()]);
// Apply another write.
buffer
.write(
lp_to_mutable_batch(
r#"bananas,tag=platanos great=true,how_much=1000 668563242000000043"#,
)
.1,
SequenceNumber::new(1),
)
.expect("write to empty buffer should succeed");
// Snapshot the buffer into an immutable, queryable data format.
let buffer: BufferState<Snapshot> = match buffer.snapshot() {
Transition::Ok(v) => v,
Transition::Unchanged(_) => panic!("did not transition to snapshot state"),
};
// Verify the writes are still queryable.
let w2_data = buffer.get_query_data(&projection);
let expected = vec![
"+----------+-------+--------------------------------+",
"| tag | great | time |",
"+----------+-------+--------------------------------+",
"| platanos | true | 1991-03-10T00:00:42.000000042Z |",
"| platanos | true | 1991-03-10T00:00:42.000000043Z |",
"+----------+-------+--------------------------------+",
];
assert_eq!(w2_data.len(), 1);
assert_batches_eq!(&expected, &[w2_data[0].clone()]);
// Ensure the same data is returned for a second read.
{
let second_read = buffer.get_query_data(&projection);
assert_eq!(w2_data, second_read);
// And that no data was actually copied.
let same_arcs = w2_data
.iter()
.zip(second_read.iter())
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
assert!(same_arcs);
}
// Finally transition into the terminal persisting state.
let buffer: BufferState<Persisting> = buffer.into_persisting();
// Extract the final buffered result
let final_data = buffer.get_query_data(&projection);
// And once again verify no data was changed, copied or re-ordered.
assert_eq!(w2_data, final_data);
let same_arcs = w2_data
.into_iter()
.zip(final_data.into_iter())
.all(|(a, b)| Arc::ptr_eq(a.column(0), b.column(0)));
assert!(same_arcs);
// Assert the sequence numbers were recorded.
@ -258,16 +366,16 @@ mod tests {
Transition::Unchanged(_) => panic!("failed to transition"),
};
assert_eq!(buffer.get_query_data().len(), 1);
assert_eq!(buffer.get_query_data(&OwnedProjection::default()).len(), 1);
let snapshot = &buffer.get_query_data()[0];
let snapshot = buffer.get_query_data(&OwnedProjection::default())[0].clone();
// Generate the combined buffer from the original inputs to compare
// against.
mb1.extend_from(&mb2).unwrap();
let want = mb1.to_arrow(Projection::All).unwrap();
assert_eq!(&**snapshot, &want);
assert_eq!(snapshot, want);
}
#[test]

View File

@ -1,15 +1,15 @@
//! A write buffer.
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use schema::Projection;
use super::{snapshot::Snapshot, BufferState, Transition};
use crate::buffer_tree::partition::buffer::{
mutable_buffer::Buffer,
traits::{Queryable, Writeable},
use crate::{
buffer_tree::partition::buffer::{
mutable_buffer::Buffer,
traits::{Queryable, Writeable},
},
query::projection::OwnedProjection,
};
/// The FSM starting ingest state - a mutable buffer collecting writes.
@ -35,18 +35,11 @@ pub(crate) struct Buffering {
/// This method panics if converting the buffered data (if any) into an Arrow
/// [`RecordBatch`] fails (a non-transient error).
impl Queryable for Buffering {
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
let data = self.buffer.buffer().map(|v| {
Arc::new(
v.to_arrow(Projection::All)
.expect("failed to snapshot buffer data"),
)
});
match data {
Some(v) => vec![v],
None => vec![],
}
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
self.buffer
.buffer()
.map(|v| vec![projection.project_mutable_batches(v)])
.unwrap_or_default()
}
}

View File

@ -1,12 +1,12 @@
//! A write buffer, with one or more snapshots.
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use data_types::sequence_number_set::SequenceNumberSet;
use super::BufferState;
use crate::buffer_tree::partition::buffer::traits::Queryable;
use crate::{
buffer_tree::partition::buffer::traits::Queryable, query::projection::OwnedProjection,
};
/// An immutable set of [`RecordBatch`] in the process of being persisted.
#[derive(Debug)]
@ -14,18 +14,18 @@ pub(crate) struct Persisting {
/// Snapshots generated from previous buffer contents to be persisted.
///
/// INVARIANT: this array is always non-empty.
snapshots: Vec<Arc<RecordBatch>>,
snapshots: Vec<RecordBatch>,
}
impl Persisting {
pub(super) fn new(snapshots: Vec<Arc<RecordBatch>>) -> Self {
pub(super) fn new(snapshots: Vec<RecordBatch>) -> Self {
Self { snapshots }
}
}
impl Queryable for Persisting {
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
self.snapshots.clone()
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
projection.project_record_batch(&self.snapshots)
}
}
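
`project_record_batch` is this PR's helper; conceptually it reduces to arrow's own column selection. A hedged sketch of the underlying idea using stock arrow APIs (not the actual implementation; note that names absent from the schema are skipped silently, matching the "missing" column case the projection tests exercise):

use arrow::record_batch::RecordBatch;

/// Select the named columns from `batch`, skipping names not in the schema.
fn project_by_names(batch: &RecordBatch, names: &[&str]) -> RecordBatch {
    let indices: Vec<usize> = names
        .iter()
        .filter_map(|name| batch.schema().index_of(name).ok())
        .collect();
    batch
        .project(&indices)
        .expect("indices are valid by construction")
}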

View File

@ -1,12 +1,11 @@
//! A write buffer, with one or more snapshots.
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use super::BufferState;
use crate::buffer_tree::partition::buffer::{
state_machine::persisting::Persisting, traits::Queryable,
use crate::{
buffer_tree::partition::buffer::{state_machine::persisting::Persisting, traits::Queryable},
query::projection::OwnedProjection,
};
/// An immutable, queryable FSM state containing at least one buffer snapshot.
@ -15,19 +14,19 @@ pub(crate) struct Snapshot {
/// Snapshots generated from previous buffer contents.
///
/// INVARIANT: this array is always non-empty.
snapshots: Vec<Arc<RecordBatch>>,
snapshots: Vec<RecordBatch>,
}
impl Snapshot {
pub(super) fn new(snapshots: Vec<Arc<RecordBatch>>) -> Self {
pub(super) fn new(snapshots: Vec<RecordBatch>) -> Self {
assert!(!snapshots.is_empty());
Self { snapshots }
}
}
impl Queryable for Snapshot {
fn get_query_data(&self) -> Vec<Arc<RecordBatch>> {
self.snapshots.clone()
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
projection.project_record_batch(&self.snapshots)
}
}

View File

@ -1,10 +1,12 @@
//! Private traits for state machine states.
use std::{fmt::Debug, sync::Arc};
use std::fmt::Debug;
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use crate::query::projection::OwnedProjection;
/// A state that can accept writes.
pub(crate) trait Writeable: Debug {
fn write(&mut self, batch: MutableBatch) -> Result<(), mutable_batch::Error>;
@ -13,5 +15,5 @@ pub(crate) trait Writeable: Debug {
/// A state that can return the contents of the buffer as one or more
/// [`RecordBatch`] instances.
pub(crate) trait Queryable: Debug {
fn get_query_data(&self) -> Vec<Arc<RecordBatch>>;
fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch>;
}
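
As a toy illustration of how a read-only state plugs into the trait above, a hypothetical `Frozen` state could implement `Queryable` and nothing else, mirroring how `Snapshot` and `Persisting` expose data without accepting writes (`project_record_batch` is the projection helper introduced by this PR):

/// A hypothetical read-only FSM state: queryable, but not writeable.
#[derive(Debug)]
struct Frozen {
    batches: Vec<RecordBatch>,
}

impl Queryable for Frozen {
    fn get_query_data(&self, projection: &OwnedProjection) -> Vec<RecordBatch> {
        // Apply the caller's projection to the immutable batches.
        projection.project_record_batch(&self.batches)
    }
}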

View File

@ -14,7 +14,7 @@ use crate::{
buffer_tree::{
namespace::NamespaceName,
partition::{resolver::SortKeyResolver, PartitionData, SortKeyState},
table::TableName,
table::TableMetadata,
},
deferred_load::DeferredLoad,
};
@ -173,7 +173,7 @@ where
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
// Use the cached PartitionKey instead of the caller's partition_key,
// preferring to reuse the already-shared Arc<str> in the cache.
@ -203,7 +203,7 @@ where
namespace_id,
namespace_name,
table_id,
table_name,
table,
SortKeyState::Deferred(Arc::new(sort_key_resolver)),
)));
}
@ -212,13 +212,7 @@ where
// Otherwise delegate to the catalog / inner impl.
self.inner
.get_partition(
partition_key,
namespace_id,
namespace_name,
table_id,
table_name,
)
.get_partition(partition_key, namespace_id, namespace_name, table_id, table)
.await
}
}
@ -234,7 +228,7 @@ mod tests {
use crate::{
buffer_tree::partition::resolver::mock::MockPartitionProvider,
test_util::{
defer_namespace_name_1_sec, defer_table_name_1_sec, PartitionDataBuilder,
defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
ARBITRARY_NAMESPACE_ID, ARBITRARY_NAMESPACE_NAME, ARBITRARY_PARTITION_ID,
ARBITRARY_PARTITION_KEY, ARBITRARY_PARTITION_KEY_STR, ARBITRARY_TABLE_ID,
ARBITRARY_TABLE_NAME,
@ -270,15 +264,15 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
)
.await;
assert_eq!(got.lock().partition_id(), ARBITRARY_PARTITION_ID);
assert_eq!(got.lock().table_id(), ARBITRARY_TABLE_ID);
assert_eq!(
&**got.lock().table_name().get().await,
&***ARBITRARY_TABLE_NAME
&**got.lock().table().get().await.name(),
&**ARBITRARY_TABLE_NAME
);
assert_eq!(
&**got.lock().namespace_name().get().await,
@ -309,15 +303,15 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
)
.await;
assert_eq!(got.lock().partition_id(), ARBITRARY_PARTITION_ID);
assert_eq!(got.lock().table_id(), ARBITRARY_TABLE_ID);
assert_eq!(
&**got.lock().table_name().get().await,
&***ARBITRARY_TABLE_NAME
&**got.lock().table().get().await.name(),
&**ARBITRARY_TABLE_NAME
);
assert_eq!(
&**got.lock().namespace_name().get().await,
@ -366,15 +360,15 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
)
.await;
assert_eq!(got.lock().partition_id(), other_key_id);
assert_eq!(got.lock().table_id(), ARBITRARY_TABLE_ID);
assert_eq!(
&**got.lock().table_name().get().await,
&***ARBITRARY_TABLE_NAME
&**got.lock().table().get().await.name(),
&**ARBITRARY_TABLE_NAME
);
}
@ -402,15 +396,15 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
other_table,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
)
.await;
assert_eq!(got.lock().partition_id(), ARBITRARY_PARTITION_ID);
assert_eq!(got.lock().table_id(), other_table);
assert_eq!(
&**got.lock().table_name().get().await,
&***ARBITRARY_TABLE_NAME
&**got.lock().table().get().await.name(),
&**ARBITRARY_TABLE_NAME
);
}
}

View File

@ -15,7 +15,7 @@ use crate::{
buffer_tree::{
namespace::NamespaceName,
partition::{PartitionData, SortKeyState},
table::TableName,
table::TableMetadata,
},
deferred_load::DeferredLoad,
};
@ -61,12 +61,12 @@ impl PartitionProvider for CatalogPartitionResolver {
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
debug!(
%partition_key,
%table_id,
%table_name,
%table,
"upserting partition in catalog"
);
let p = Backoff::new(&self.backoff_config)
@ -86,7 +86,7 @@ impl PartitionProvider for CatalogPartitionResolver {
namespace_id,
namespace_name,
table_id,
table_name,
table,
SortKeyState::Provided(p.sort_key()),
)))
}
@ -103,6 +103,7 @@ mod tests {
use iox_catalog::test_helpers::{arbitrary_namespace, arbitrary_table};
use super::*;
use crate::buffer_tree::table::TableName;
const TABLE_NAME: &str = "bananas";
const NAMESPACE_NAME: &str = "ns-bananas";
@ -138,17 +139,25 @@ mod tests {
table_id,
Arc::new(DeferredLoad::new(
Duration::from_secs(1),
async { TableName::from(TABLE_NAME) },
async {
TableMetadata::new_for_testing(
TableName::from(TABLE_NAME),
Default::default(),
)
},
&metrics,
)),
)
.await;
// Ensure the table name is available.
let _ = got.lock().table_name().get().await;
let _ = got.lock().table().get().await.name();
assert_eq!(got.lock().namespace_id(), namespace_id);
assert_eq!(got.lock().table_name().to_string(), table_name.to_string());
assert_eq!(
got.lock().table().get().await.name().to_string(),
table_name.to_string()
);
assert_matches!(got.lock().sort_key(), SortKeyState::Provided(None));
assert!(got.lock().partition_key.ptr_eq(&callers_partition_key));

View File

@ -14,7 +14,7 @@ use hashbrown::{hash_map::Entry, HashMap};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableMetadata},
deferred_load::DeferredLoad,
};
@ -146,7 +146,7 @@ where
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
let key = Key {
namespace_id,
@ -170,7 +170,7 @@ where
namespace_id,
namespace_name,
table_id,
table_name,
table,
));
// Make the future poll-able by many callers, all of which
@ -233,7 +233,7 @@ async fn do_fetch<T>(
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>>
where
T: PartitionProvider + 'static,
@ -248,13 +248,7 @@ where
// (which would cause the connection to be returned).
tokio::spawn(async move {
inner
.get_partition(
partition_key,
namespace_id,
namespace_name,
table_id,
table_name,
)
.get_partition(partition_key, namespace_id, namespace_name, table_id, table)
.await
})
.await
@ -280,7 +274,7 @@ mod tests {
use crate::{
buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState},
test_util::{
defer_namespace_name_1_sec, defer_table_name_1_sec, PartitionDataBuilder,
defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_ID,
},
};
@ -308,7 +302,7 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
)
})
.collect::<FuturesUnordered<_>>()
@ -342,7 +336,7 @@ mod tests {
_namespace_id: NamespaceId,
_namespace_name: Arc<DeferredLoad<NamespaceName>>,
_table_id: TableId,
_table_name: Arc<DeferredLoad<TableName>>,
_table: Arc<DeferredLoad<TableMetadata>>,
) -> core::pin::Pin<
Box<
dyn core::future::Future<Output = Arc<Mutex<PartitionData>>>
@ -368,7 +362,7 @@ mod tests {
let data = PartitionDataBuilder::new().build();
let namespace_loader = defer_namespace_name_1_sec();
let table_name_loader = defer_table_name_1_sec();
let table_loader = defer_table_metadata_1_sec();
// Add a single instance of the partition - if more than one call is
// made to the mock, it will panic.
@ -384,14 +378,14 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
Arc::clone(&namespace_loader),
ARBITRARY_TABLE_ID,
Arc::clone(&table_name_loader),
Arc::clone(&table_loader),
);
let pa_2 = layer.get_partition(
ARBITRARY_PARTITION_KEY.clone(),
ARBITRARY_NAMESPACE_ID,
Arc::clone(&namespace_loader),
ARBITRARY_TABLE_ID,
Arc::clone(&table_name_loader),
Arc::clone(&table_loader),
);
let waker = futures::task::noop_waker();
@ -411,7 +405,7 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
namespace_loader,
ARBITRARY_TABLE_ID,
table_name_loader,
table_loader,
)
.with_timeout_panic(Duration::from_secs(5))
.await;
@ -441,7 +435,7 @@ mod tests {
_namespace_id: NamespaceId,
_namespace_name: Arc<DeferredLoad<NamespaceName>>,
_table_id: TableId,
_table_name: Arc<DeferredLoad<TableName>>,
_table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
let waker = self.wait.notified();
let permit = self.sem.acquire().await.unwrap();
@ -481,7 +475,7 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
);
let waker = futures::task::noop_waker();

View File

@ -8,7 +8,7 @@ use parking_lot::Mutex;
use super::r#trait::PartitionProvider;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableMetadata},
deferred_load::{self, DeferredLoad},
};
@ -53,7 +53,7 @@ impl PartitionProvider for MockPartitionProvider {
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
let p = self
.partitions
@ -75,8 +75,8 @@ impl PartitionProvider for MockPartitionProvider {
deferred_load::UNRESOLVED_DISPLAY_STRING,
);
let actual_table_name = p.table_name().to_string();
let expected_table_name = table_name.get().await.to_string();
let actual_table_name = p.table().to_string();
let expected_table_name = table.get().await.name().to_string();
assert!(
(actual_table_name.as_str() == expected_table_name)
|| (actual_table_name == deferred_load::UNRESOLVED_DISPLAY_STRING),

View File

@ -5,7 +5,7 @@ use data_types::{NamespaceId, PartitionKey, TableId};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableMetadata},
deferred_load::DeferredLoad,
};
@ -24,7 +24,7 @@ pub(crate) trait PartitionProvider: Send + Sync + Debug {
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>>;
}
@ -39,16 +39,10 @@ where
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
) -> Arc<Mutex<PartitionData>> {
(**self)
.get_partition(
partition_key,
namespace_id,
namespace_name,
table_id,
table_name,
)
.get_partition(partition_key, namespace_id, namespace_name, table_id, table)
.await
}
}
@ -61,7 +55,7 @@ mod tests {
use crate::{
buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState},
test_util::{
defer_namespace_name_1_sec, defer_table_name_1_sec, PartitionDataBuilder,
defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_PARTITION_KEY,
ARBITRARY_TABLE_ID,
},
@ -70,10 +64,10 @@ mod tests {
#[tokio::test]
async fn test_arc_impl() {
let namespace_loader = defer_namespace_name_1_sec();
let table_name_loader = defer_table_name_1_sec();
let table_loader = defer_table_metadata_1_sec();
let data = PartitionDataBuilder::new()
.with_table_name_loader(Arc::clone(&table_name_loader))
.with_table_loader(Arc::clone(&table_loader))
.with_namespace_loader(Arc::clone(&namespace_loader))
.build();
@ -85,7 +79,7 @@ mod tests {
ARBITRARY_NAMESPACE_ID,
Arc::clone(&namespace_loader),
ARBITRARY_TABLE_ID,
Arc::clone(&table_name_loader),
Arc::clone(&table_loader),
)
.await;
assert_eq!(got.lock().partition_id(), ARBITRARY_PARTITION_ID);
@ -94,9 +88,6 @@ mod tests {
got.lock().namespace_name().to_string(),
namespace_loader.to_string()
);
assert_eq!(
got.lock().table_name().to_string(),
table_name_loader.to_string()
);
assert_eq!(got.lock().table().to_string(), table_loader.to_string());
}
}

View File

@ -4,20 +4,24 @@ use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use metric::U64Counter;
use parking_lot::Mutex;
use predicate::Predicate;
use trace::span::Span;
use super::{
namespace::{name_resolver::NamespaceNameProvider, NamespaceData},
partition::{resolver::PartitionProvider, PartitionData},
post_write::PostWriteObserver,
table::name_resolver::TableNameProvider,
table::metadata_resolver::TableProvider,
};
use crate::{
arcmap::ArcMap,
dml_payload::IngestOp,
dml_sink::DmlSink,
partition_iter::PartitionIter,
query::{response::QueryResponse, tracing::QueryExecTracing, QueryError, QueryExec},
query::{
projection::OwnedProjection, response::QueryResponse, tracing::QueryExecTracing,
QueryError, QueryExec,
},
};
/// A [`BufferTree`] is the root of an in-memory tree of many [`NamespaceData`]
@ -92,12 +96,12 @@ pub(crate) struct BufferTree<O> {
/// [`NamespaceName`]: data_types::NamespaceName
namespaces: ArcMap<NamespaceId, NamespaceData<O>>,
namespace_name_resolver: Arc<dyn NamespaceNameProvider>,
/// The [`TableName`] provider used by [`NamespaceData`] to initialise a
/// The [`TableMetadata`] provider used by [`NamespaceData`] to initialise a
/// [`TableData`].
///
/// [`TableName`]: crate::buffer_tree::table::TableName
/// [`TableMetadata`]: crate::buffer_tree::table::TableMetadata
/// [`TableData`]: crate::buffer_tree::table::TableData
table_name_resolver: Arc<dyn TableNameProvider>,
table_resolver: Arc<dyn TableProvider>,
metrics: Arc<metric::Registry>,
namespace_count: U64Counter,
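
Each level of the tree lazily initialises its children through `ArcMap`. A minimal mutex-based stand-in for that get-or-create behaviour (a hypothetical simplification; the real `ArcMap` is considerably more careful about locking):

use std::collections::HashMap;
use std::hash::Hash;
use std::sync::{Arc, Mutex};

#[derive(Debug, Default)]
struct SimpleArcMap<K, V> {
    inner: Mutex<HashMap<K, Arc<V>>>,
}

impl<K: Hash + Eq + Clone, V> SimpleArcMap<K, V> {
    /// Return the value for `key`, initialising it via `init` on first use.
    fn get_or_insert_with(&self, key: &K, init: impl FnOnce() -> V) -> Arc<V> {
        let mut guard = self.inner.lock().expect("mutex poisoned");
        let value = guard
            .entry(key.clone())
            .or_insert_with(|| Arc::new(init()));
        Arc::clone(value)
    }
}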
@ -112,7 +116,7 @@ where
/// Initialise a new [`BufferTree`] that emits metrics to `metrics`.
pub(crate) fn new(
namespace_name_resolver: Arc<dyn NamespaceNameProvider>,
table_name_resolver: Arc<dyn TableNameProvider>,
table_resolver: Arc<dyn TableProvider>,
partition_provider: Arc<dyn PartitionProvider>,
post_write_observer: Arc<O>,
metrics: Arc<metric::Registry>,
@ -127,7 +131,7 @@ where
Self {
namespaces: Default::default(),
namespace_name_resolver,
table_name_resolver,
table_resolver,
metrics,
partition_provider,
post_write_observer,
@ -178,7 +182,7 @@ where
Arc::new(NamespaceData::new(
namespace_id,
Arc::new(self.namespace_name_resolver.for_namespace(namespace_id)),
Arc::clone(&self.table_name_resolver),
Arc::clone(&self.table_resolver),
Arc::clone(&self.partition_provider),
Arc::clone(&self.post_write_observer),
&self.metrics,
@ -200,8 +204,9 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
// Extract the namespace if it exists.
let inner = self
@ -211,7 +216,7 @@ where
// Delegate query execution to the namespace, wrapping the execution in
// a tracing delegate to emit a child span.
QueryExecTracing::new(inner, "namespace")
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, projection, span, predicate)
.await
}
}
@ -227,29 +232,41 @@ where
#[cfg(test)]
mod tests {
use std::{sync::Arc, time::Duration};
use arrow::datatypes::DataType;
use assert_matches::assert_matches;
use data_types::{
partition_template::{test_table_partition_override, TemplatePart},
PartitionId, PartitionKey,
};
use datafusion::{
assert_batches_eq, assert_batches_sorted_eq,
prelude::{col, lit},
scalar::ScalarValue,
};
use futures::StreamExt;
use lazy_static::lazy_static;
use metric::{Attributes, Metric};
use predicate::Predicate;
use test_helpers::maybe_start_logging;
use super::*;
use crate::{
buffer_tree::{
namespace::{name_resolver::mock::MockNamespaceNameProvider, NamespaceData},
partition::resolver::mock::MockPartitionProvider,
post_write::mock::MockPostWriteObserver,
table::TableName,
table::{metadata_resolver::mock::MockTableProvider, TableMetadata},
},
deferred_load::{self, DeferredLoad},
query::partition_response::PartitionResponse,
test_util::{
defer_namespace_name_1_ms, make_write_op, PartitionDataBuilder, ARBITRARY_NAMESPACE_ID,
ARBITRARY_NAMESPACE_NAME, ARBITRARY_PARTITION_ID, ARBITRARY_PARTITION_KEY,
ARBITRARY_TABLE_ID, ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_NAME_PROVIDER,
ARBITRARY_TABLE_ID, ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_PROVIDER,
},
};
use assert_matches::assert_matches;
use data_types::{PartitionId, PartitionKey};
use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
use futures::StreamExt;
use lazy_static::lazy_static;
use metric::{Attributes, Metric};
use std::{sync::Arc, time::Duration};
const PARTITION2_ID: PartitionId = PartitionId::new(2);
const PARTITION3_ID: PartitionId = PartitionId::new(3);
@ -278,7 +295,7 @@ mod tests {
let ns = NamespaceData::new(
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_ms(),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
&metrics,
@ -337,13 +354,19 @@ mod tests {
macro_rules! test_write_query {
(
$name:ident,
partitions = [$($partition:expr), +], // The set of PartitionData for the mock partition provider
$(table_provider = $table_provider:expr,)? // An optional table provider
partitions = [$($partition:expr), +], // The set of PartitionData for the mock
// partition provider
writes = [$($write:expr), *], // The set of WriteOperation to apply()
want = $want:expr // The expected results of querying ARBITRARY_NAMESPACE_ID and ARBITRARY_TABLE_ID
predicate = $predicate:expr, // An optional predicate to use for the query
want = $want:expr // The expected results of querying
// ARBITRARY_NAMESPACE_ID and ARBITRARY_TABLE_ID
) => {
paste::paste! {
#[tokio::test]
async fn [<test_write_query_ $name>]() {
maybe_start_logging();
// Configure the mock partition provider with the provided
// partitions.
let partition_provider = Arc::new(MockPartitionProvider::default()
@ -352,10 +375,16 @@ mod tests {
)+
);
#[allow(unused_variables)]
let table_provider = Arc::clone(&*ARBITRARY_TABLE_PROVIDER);
$(
let table_provider: Arc<dyn TableProvider> = $table_provider;
)?
// Init the buffer tree
let buf = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
table_provider,
partition_provider,
Arc::new(MockPostWriteObserver::default()),
Arc::new(metric::Registry::default()),
@ -370,7 +399,13 @@ mod tests {
// Execute the query against ARBITRARY_NAMESPACE_ID and ARBITRARY_TABLE_ID
let batches = buf
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
$predicate
)
.await
.expect("query should succeed")
.into_partition_stream()
@ -407,6 +442,7 @@ mod tests {
),
None,
)],
predicate = None,
want = [
"+----------+------+-------------------------------+",
"| region | temp | time |",
@ -456,6 +492,7 @@ mod tests {
None,
)
],
predicate = None,
want = [
"+----------+------+-------------------------------+",
"| region | temp | time |",
@ -508,6 +545,7 @@ mod tests {
None,
)
],
predicate = None,
want = [
"+--------+------+-------------------------------+",
"| region | temp | time |",
@ -520,7 +558,7 @@ mod tests {
// A query that ensures the data across multiple tables (with the same table
// name!) is correctly filtered to return only the queried table.
test_write_query!(
filter_multiple_tabls,
filter_multiple_tables,
partitions = [
PartitionDataBuilder::new()
.with_partition_id(ARBITRARY_PARTITION_ID)
@ -558,6 +596,7 @@ mod tests {
None,
)
],
predicate = None,
want = [
"+--------+------+-------------------------------+",
"| region | temp | time |",
@ -603,6 +642,7 @@ mod tests {
None,
)
],
predicate = None,
want = [
"+----------+------+-------------------------------+",
"| region | temp | time |",
@ -613,6 +653,98 @@ mod tests {
]
);
// This test asserts that the results returned from a query to the
// [`BufferTree`] filters rows from the result as directed by the
// query's [`Predicate`].
//
// It makes sure that, for a [`BufferTree`] with a set of partitions split
// by some key, a query with a predicate `<partition key column> == <arbitrary literal>`
// returns partition data filtered to contain only those rows which hold the
// specified value in that partition key column.
test_write_query!(
filter_by_predicate_partition_key,
table_provider = Arc::new(MockTableProvider::new(TableMetadata::new_for_testing(
ARBITRARY_TABLE_NAME.clone(),
test_table_partition_override(vec![TemplatePart::TagValue("region")])
))),
partitions = [
PartitionDataBuilder::new()
.with_partition_id(ARBITRARY_PARTITION_ID)
.with_partition_key(ARBITRARY_PARTITION_KEY.clone()) // "platanos"
.build(),
PartitionDataBuilder::new()
.with_partition_id(PARTITION2_ID)
.with_partition_key(PARTITION2_KEY.clone()) // "p2"
.build()
],
writes = [
make_write_op(
&ARBITRARY_PARTITION_KEY,
ARBITRARY_NAMESPACE_ID,
&ARBITRARY_TABLE_NAME,
ARBITRARY_TABLE_ID,
0,
&format!(
r#"{},region={} temp=35 4242424242"#,
&*ARBITRARY_TABLE_NAME, &*ARBITRARY_PARTITION_KEY
),
None,
),
make_write_op(
&ARBITRARY_PARTITION_KEY,
ARBITRARY_NAMESPACE_ID,
&ARBITRARY_TABLE_NAME,
ARBITRARY_TABLE_ID,
1,
&format!(
r#"{},region={} temp=12 4242424242"#,
&*ARBITRARY_TABLE_NAME, &*ARBITRARY_PARTITION_KEY
),
None,
),
make_write_op(
&PARTITION2_KEY,
ARBITRARY_NAMESPACE_ID,
&ARBITRARY_TABLE_NAME,
ARBITRARY_TABLE_ID,
2,
&format!(
r#"{},region={} temp=17 7676767676"#,
&*ARBITRARY_TABLE_NAME, *PARTITION2_KEY
),
None,
),
make_write_op(
&PARTITION2_KEY,
ARBITRARY_NAMESPACE_ID,
&ARBITRARY_TABLE_NAME,
ARBITRARY_TABLE_ID,
3,
&format!(
r#"{},region={} temp=13 7676767676"#,
&*ARBITRARY_TABLE_NAME, *PARTITION2_KEY,
),
None,
)
],
// NOTE: The querier will coerce the type of the predicates correctly, so the ingester does NOT need to perform
// type coercion. The literal type used here should reflect that.
predicate = Some(Predicate::new().with_expr(col("region").eq(lit(
ScalarValue::Dictionary(
Box::new(DataType::Int32),
Box::new(ScalarValue::from(PARTITION2_KEY.inner()))
)
)))),
want = [
"+--------+------+-------------------------------+",
"| region | temp | time |",
"+--------+------+-------------------------------+",
"| p2 | 13.0 | 1970-01-01T00:00:07.676767676 |",
"| p2 | 17.0 | 1970-01-01T00:00:07.676767676 |",
"+--------+------+-------------------------------+",
]
);
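
The predicate in the test above is worth spelling out: tag columns are dictionary encoded, so the literal is built as a `ScalarValue::Dictionary` rather than a plain `Utf8` value, letting the comparison proceed with no coercion in the ingester. A hedged helper sketch of the same construction (assumes DataFusion's `col`/`lit` and the `ScalarValue::Dictionary` variant exactly as used in this commit):

use arrow::datatypes::DataType;
use datafusion::prelude::{col, lit, Expr};
use datafusion::scalar::ScalarValue;

/// Build `region == <value>` with a dictionary-typed literal, matching the
/// physical type of a dictionary-encoded tag column.
fn region_eq(value: &str) -> Expr {
    col("region").eq(lit(ScalarValue::Dictionary(
        Box::new(DataType::Int32),
        Box::new(ScalarValue::from(value)),
    )))
}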
/// Assert that multiple writes to a single namespace/table results in a
/// single namespace being created, and matching metrics.
#[tokio::test]
@ -627,7 +759,7 @@ mod tests {
)
.with_partition(
PartitionDataBuilder::new()
.with_partition_id(ARBITRARY_PARTITION_ID)
.with_partition_id(PARTITION2_ID)
.with_partition_key(PARTITION2_KEY.clone())
.build(),
),
@ -638,7 +770,7 @@ mod tests {
// Init the buffer tree
let buf = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
Arc::clone(&metrics),
@ -722,9 +854,14 @@ mod tests {
.with_partition_id(PARTITION3_ID)
.with_partition_key(PARTITION3_KEY.clone())
.with_table_id(TABLE2_ID)
.with_table_name_loader(Arc::new(DeferredLoad::new(
.with_table_loader(Arc::new(DeferredLoad::new(
Duration::from_secs(1),
async move { TableName::from(TABLE2_NAME) },
async move {
TableMetadata::new_for_testing(
TABLE2_NAME.into(),
Default::default(),
)
},
&metric::Registry::default(),
)))
.build(),
@ -734,7 +871,7 @@ mod tests {
// Init the buffer tree
let buf = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
Arc::clone(&Arc::new(metric::Registry::default())),
@ -821,7 +958,7 @@ mod tests {
// Init the BufferTree
let buf = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
Arc::new(metric::Registry::default()),
@ -829,7 +966,13 @@ mod tests {
// Query the empty tree
let err = buf
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect_err("query should fail");
assert_matches!(err, QueryError::NamespaceNotFound(ns) => {
@ -854,7 +997,13 @@ mod tests {
// Ensure an unknown table errors
let err = buf
.query_exec(ARBITRARY_NAMESPACE_ID, TABLE2_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
TABLE2_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect_err("query should fail");
assert_matches!(err, QueryError::TableNotFound(ns, t) => {
@ -863,9 +1012,15 @@ mod tests {
});
// Ensure a valid namespace / table does not error
buf.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.await
.expect("namespace / table should exist");
buf.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("namespace / table should exist");
}
/// This test asserts the read consistency properties defined in the
@ -906,7 +1061,7 @@ mod tests {
// Init the buffer tree
let buf = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
partition_provider,
Arc::new(MockPostWriteObserver::default()),
Arc::new(metric::Registry::default()),
@ -931,7 +1086,13 @@ mod tests {
// Execute a query of the buffer tree, generating the result stream, but
// DO NOT consume it.
let stream = buf
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("query should succeed")
.into_partition_stream();

View File

@ -1,14 +1,23 @@
//! Table level data buffer structures.
pub(crate) mod name_resolver;
pub(crate) mod metadata_resolver;
use std::{fmt::Debug, sync::Arc};
use std::{collections::HashMap, fmt::Debug, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, SequenceNumber, TableId};
use data_types::{
partition_template::{build_column_values, ColumnValue, TablePartitionTemplateOverride},
NamespaceId, PartitionKey, SequenceNumber, Table, TableId,
};
use datafusion::scalar::ScalarValue;
use iox_query::{
chunk_statistics::{create_chunk_statistics, ColumnRange},
pruning::prune_summaries,
QueryChunk,
};
use mutable_batch::MutableBatch;
use parking_lot::Mutex;
use schema::Projection;
use predicate::Predicate;
use trace::span::{Span, SpanRecorder};
use super::{
@ -20,10 +29,55 @@ use crate::{
arcmap::ArcMap,
deferred_load::DeferredLoad,
query::{
partition_response::PartitionResponse, response::PartitionStream, QueryError, QueryExec,
partition_response::PartitionResponse, projection::OwnedProjection,
response::PartitionStream, QueryError, QueryExec,
},
query_adaptor::QueryAdaptor,
};
/// Metadata from the catalog for a table
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TableMetadata {
name: TableName,
partition_template: TablePartitionTemplateOverride,
}
impl TableMetadata {
#[cfg(test)]
pub fn new_for_testing(
name: TableName,
partition_template: TablePartitionTemplateOverride,
) -> Self {
Self {
name,
partition_template,
}
}
pub(crate) fn name(&self) -> &TableName {
&self.name
}
pub(crate) fn partition_template(&self) -> &TablePartitionTemplateOverride {
&self.partition_template
}
}
impl From<Table> for TableMetadata {
fn from(t: Table) -> Self {
Self {
name: t.name.into(),
partition_template: t.partition_template,
}
}
}
impl std::fmt::Display for TableMetadata {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(&self.name, f)
}
}
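A quick, hedged illustration of the new type (crate-internal, so this would live in a test inside the crate; `TableName::from(&str)` is used the same way elsewhere in this diff):

// Build metadata via the test-only constructor and read back the
// accessors added above.
let meta = TableMetadata::new_for_testing(
    TableName::from("bananas"),
    TablePartitionTemplateOverride::default(),
);
let _template = meta.partition_template(); // later used for partition pruning
assert_eq!(meta.to_string(), "bananas");   // Display delegates to the name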
/// The string name / identifier of a Table.
///
/// A reference-counted, cheap clone-able string.
@ -69,7 +123,7 @@ impl PartialEq<str> for TableName {
#[derive(Debug)]
pub(crate) struct TableData<O> {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
catalog_table: Arc<DeferredLoad<TableMetadata>>,
/// The catalog ID of the namespace this table is being populated from.
namespace_id: NamespaceId,
@ -93,7 +147,7 @@ impl<O> TableData<O> {
/// for the first time.
pub(super) fn new(
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
catalog_table: Arc<DeferredLoad<TableMetadata>>,
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
partition_provider: Arc<dyn PartitionProvider>,
@ -101,7 +155,7 @@ impl<O> TableData<O> {
) -> Self {
Self {
table_id,
table_name,
catalog_table,
namespace_id,
namespace_name,
partition_data: Default::default(),
@ -132,9 +186,9 @@ impl<O> TableData<O> {
self.table_id
}
/// Returns the name of this table.
pub(crate) fn table_name(&self) -> &Arc<DeferredLoad<TableName>> {
&self.table_name
/// Returns the catalog data for this table.
pub(crate) fn catalog_table(&self) -> &Arc<DeferredLoad<TableMetadata>> {
&self.catalog_table
}
/// Return the [`NamespaceId`] this table is a part of.
@ -166,7 +220,7 @@ where
self.namespace_id,
Arc::clone(&self.namespace_name),
self.table_id,
Arc::clone(&self.table_name),
Arc::clone(&self.catalog_table),
)
.await;
// Add the partition to the map.
@ -202,8 +256,9 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
assert_eq!(self.table_id, table_id, "buffer tree index inconsistency");
assert_eq!(
@ -211,18 +266,21 @@ where
"buffer tree index inconsistency"
);
let table_partition_template = self.catalog_table.get().await.partition_template;
// Gather the partition data from all of the partitions in this table.
let span = SpanRecorder::new(span);
let partitions = self.partitions().into_iter().map(move |p| {
let mut span = span.child("partition read");
let (id, hash_id, completed_persistence_count, data) = {
let (id, hash_id, completed_persistence_count, data, partition_key) = {
let mut p = p.lock();
(
p.partition_id(),
p.partition_hash_id().cloned(),
p.completed_persistence_count(),
p.get_query_data(),
p.get_query_data(&projection),
p.partition_key().clone(),
)
};
@ -230,16 +288,36 @@ where
Some(data) => {
assert_eq!(id, data.partition_id());
// Project the data if necessary
let columns = columns.iter().map(String::as_str).collect::<Vec<_>>();
let selection = if columns.is_empty() {
Projection::All
} else {
Projection::Some(columns.as_ref())
};
// Potentially prune out this partition if the partition
// template & derived partition key can be used to match
// against the optional predicate.
if predicate
.as_ref()
.map(|p| {
!keep_after_pruning_partition_key(
&table_partition_template,
&partition_key,
p,
&data,
)
})
.unwrap_or_default()
{
return PartitionResponse::new(
vec![],
id,
hash_id,
completed_persistence_count,
);
}
let data = data.project_selection(selection).into_iter().collect();
PartitionResponse::new(data, id, hash_id, completed_persistence_count)
// Projection has already been applied in get_query_data() above.
PartitionResponse::new(
data.into_record_batches(),
id,
hash_id,
completed_persistence_count,
)
}
None => PartitionResponse::new(vec![], id, hash_id, completed_persistence_count),
};
@ -252,6 +330,106 @@ where
}
}
/// Return true if `data` contains one or more rows matching `predicate`,
/// pruning based on the `partition_key` and `template`.
///
/// Returns false iff it can be proven that no row in `data` matches the
/// predicate.
fn keep_after_pruning_partition_key(
table_partition_template: &TablePartitionTemplateOverride,
partition_key: &PartitionKey,
predicate: &Predicate,
data: &QueryAdaptor,
) -> bool {
// Construct a set of per-column min/max statistics based on the partition
// key values.
let column_ranges = Arc::new(
build_column_values(table_partition_template, partition_key.inner())
.filter_map(|(col, val)| {
let range = match val {
ColumnValue::Identity(s) => {
let s = Arc::new(ScalarValue::from(s.as_ref()));
ColumnRange {
min_value: Arc::clone(&s),
max_value: s,
}
}
ColumnValue::Prefix(p) if p.is_empty() => return None,
ColumnValue::Prefix(p) => {
// If the partition only has a prefix of the tag value
// (it was truncated) then form a conservative range:
//
// # Minimum
// Use the prefix itself.
//
// Note that the minimum is inclusive.
//
// All values in the partition are either:
//
// - identical to the prefix, in which case they are
// included by the inclusive minimum
//
// - have the form `"<prefix><s>"`, and it holds that
// `"<prefix><s>" > "<prefix>"` for all strings
// `"<s>"`.
//
// # Maximum
// Use `"<prefix_excluding_last_char><char::max>"`.
//
// Note that the maximum is inclusive.
//
// All strings in this partition must be smaller than
// this constructed maximum, because string comparison
// is front-to-back and
// `"<prefix_excluding_last_char><char::max>" >
// "<prefix>"`.
let min_value = Arc::new(ScalarValue::from(p.as_ref()));
let mut chars = p.as_ref().chars().collect::<Vec<_>>();
*chars.last_mut().expect("checked that prefix is not empty") =
std::char::MAX;
let max_value = Arc::new(ScalarValue::from(
chars.into_iter().collect::<String>().as_str(),
));
ColumnRange {
min_value,
max_value,
}
}
};
Some((Arc::from(col), range))
})
.collect::<HashMap<_, _>>(),
);
let chunk_statistics = Arc::new(create_chunk_statistics(
data.num_rows(),
data.schema(),
data.ts_min_max(),
&column_ranges,
));
prune_summaries(
data.schema(),
&[(chunk_statistics, data.schema().as_arrow())],
predicate,
)
// Errors are logged by `iox_query` and are sometimes acceptable, e.g. for
// unimplemented DataFusion features or upstream bugs. The querier uses the
// same strategy. Pruning is a mere optimization and should not lead to
// crashes or unreadable data.
.ok()
.map(|vals| {
vals.into_iter()
.next()
.expect("one chunk in, one chunk out")
})
.unwrap_or(true)
}
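The truncated-prefix handling above is the subtle part of this function. A minimal standalone sketch of the conservative inclusive upper bound it constructs (pure std, independent of the crate):

// For a truncated prefix, replace the last char with char::MAX to get an
// inclusive upper bound covering every string that starts with the prefix.
fn prefix_max_bound(prefix: &str) -> String {
    let mut chars: Vec<char> = prefix.chars().collect();
    *chars.last_mut().expect("prefix must be non-empty") = char::MAX;
    chars.into_iter().collect()
}

fn main() {
    let bound = prefix_max_bound("ab");
    // Every value starting with the prefix sorts below the bound...
    assert!("ab" < bound.as_str());
    assert!("abzzzz" < bound.as_str());
    // ...so [prefix, bound] is a safe (if loose) pruning range.
    println!("{bound:?}");
}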
#[cfg(test)]
mod tests {
use std::sync::Arc;
@ -265,7 +443,7 @@ mod tests {
post_write::mock::MockPostWriteObserver,
},
test_util::{
defer_namespace_name_1_sec, defer_table_name_1_sec, PartitionDataBuilder,
defer_namespace_name_1_sec, defer_table_metadata_1_sec, PartitionDataBuilder,
ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_ID,
ARBITRARY_TABLE_NAME,
},
@ -280,7 +458,7 @@ mod tests {
let table = TableData::new(
ARBITRARY_TABLE_ID,
defer_table_name_1_sec(),
defer_table_metadata_1_sec(),
ARBITRARY_NAMESPACE_ID,
defer_namespace_name_1_sec(),
partition_provider,

View File

@ -4,24 +4,24 @@ use backoff::{Backoff, BackoffConfig};
use data_types::TableId;
use iox_catalog::interface::Catalog;
use super::TableName;
use super::TableMetadata;
use crate::deferred_load::DeferredLoad;
/// An abstract provider of a [`DeferredLoad`] configured to fetch the
/// [`TableName`] of the specified [`TableId`].
pub(crate) trait TableNameProvider: Send + Sync + std::fmt::Debug {
fn for_table(&self, id: TableId) -> DeferredLoad<TableName>;
/// catalog [`TableMetadata`] of the specified [`TableId`].
pub(crate) trait TableProvider: Send + Sync + std::fmt::Debug {
fn for_table(&self, id: TableId) -> DeferredLoad<TableMetadata>;
}
#[derive(Debug)]
pub(crate) struct TableNameResolver {
pub(crate) struct TableResolver {
max_smear: Duration,
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
metrics: Arc<metric::Registry>,
}
impl TableNameResolver {
impl TableResolver {
pub(crate) fn new(
max_smear: Duration,
catalog: Arc<dyn Catalog>,
@ -36,16 +36,16 @@ impl TableNameResolver {
}
}
/// Fetch the [`TableName`] from the [`Catalog`] for specified
/// Fetch the [`TableMetadata`] from the [`Catalog`] for specified
/// `table_id`, retrying endlessly when errors occur.
pub(crate) async fn fetch(
table_id: TableId,
catalog: Arc<dyn Catalog>,
backoff_config: BackoffConfig,
) -> TableName {
) -> TableMetadata {
Backoff::new(&backoff_config)
.retry_all_errors("fetch table name", || async {
let s = catalog
.retry_all_errors("fetch table", || async {
let table = catalog
.repositories()
.await
.tables()
@ -54,18 +54,17 @@ impl TableNameResolver {
.unwrap_or_else(|| {
panic!("resolving table name for non-existent table id {table_id}")
})
.name
.into();
Result::<_, iox_catalog::interface::Error>::Ok(s)
Result::<_, iox_catalog::interface::Error>::Ok(table)
})
.await
.expect("retry forever")
}
}
impl TableNameProvider for TableNameResolver {
fn for_table(&self, id: TableId) -> DeferredLoad<TableName> {
impl TableProvider for TableResolver {
fn for_table(&self, id: TableId) -> DeferredLoad<TableMetadata> {
DeferredLoad::new(
self.max_smear,
Self::fetch(id, Arc::clone(&self.catalog), self.backoff_config.clone()),
@ -79,28 +78,33 @@ pub(crate) mod mock {
use super::*;
#[derive(Debug)]
pub(crate) struct MockTableNameProvider {
name: TableName,
pub(crate) struct MockTableProvider {
table: TableMetadata,
}
impl MockTableNameProvider {
pub(crate) fn new(name: impl Into<TableName>) -> Self {
Self { name: name.into() }
impl MockTableProvider {
pub(crate) fn new(table: impl Into<TableMetadata>) -> Self {
Self {
table: table.into(),
}
}
}
impl Default for MockTableNameProvider {
impl Default for MockTableProvider {
fn default() -> Self {
Self::new("bananas")
Self::new(TableMetadata::new_for_testing(
"bananas".into(),
Default::default(),
))
}
}
impl TableNameProvider for MockTableNameProvider {
fn for_table(&self, _id: TableId) -> DeferredLoad<TableName> {
let name = self.name.clone();
impl TableProvider for MockTableProvider {
fn for_table(&self, _id: TableId) -> DeferredLoad<TableMetadata> {
let table = self.table.clone();
DeferredLoad::new(
Duration::from_secs(1),
async { name },
async { table },
&metric::Registry::default(),
)
}
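A hedged usage sketch of the renamed mock (async, crate-internal types):

// The mock resolves immediately to metadata named "bananas".
let provider = MockTableProvider::default();
let load = provider.for_table(TableId::new(1));
let meta = load.get().await;
assert_eq!(meta.to_string(), "bananas");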
@ -129,7 +133,7 @@ mod tests {
// Populate the catalog with the namespace / table
let (_ns_id, table_id) = populate_catalog(&*catalog, NAMESPACE_NAME, TABLE_NAME).await;
let fetcher = Arc::new(TableNameResolver::new(
let fetcher = Arc::new(TableResolver::new(
Duration::from_secs(10),
Arc::clone(&catalog),
backoff_config.clone(),
@ -141,6 +145,6 @@ mod tests {
.get()
.with_timeout_panic(Duration::from_secs(5))
.await;
assert_eq!(&**got, TABLE_NAME);
assert_eq!(got.name(), TABLE_NAME);
}
}

View File

@ -30,7 +30,7 @@ use crate::{
partition::resolver::{
CatalogPartitionResolver, CoalescePartitionResolver, PartitionCache, PartitionProvider,
},
table::name_resolver::{TableNameProvider, TableNameResolver},
table::metadata_resolver::{TableProvider, TableResolver},
BufferTree,
},
dml_sink::{instrumentation::DmlSinkInstrumentation, tracing::DmlSinkTracing},
@ -253,8 +253,8 @@ where
Arc::clone(&metrics),
));
// Initialise the deferred table name resolver.
let table_name_provider: Arc<dyn TableNameProvider> = Arc::new(TableNameResolver::new(
// Initialise the deferred table metadata resolver.
let table_provider: Arc<dyn TableProvider> = Arc::new(TableResolver::new(
persist_background_fetch_time,
Arc::clone(&catalog),
BackoffConfig::default(),
@ -326,7 +326,7 @@ where
let buffer = Arc::new(BufferTree::new(
namespace_name_provider,
table_name_provider,
table_provider,
partition_provider,
Arc::new(hot_partition_persister),
Arc::clone(&metrics),
@ -389,9 +389,7 @@ where
// ingester, but they are only used for internal ordering of operations at
// runtime.
let timestamp = Arc::new(TimestampOracle::new(
max_sequence_number
.map(|v| u64::try_from(v.get()).expect("sequence number overflow"))
.unwrap_or(0),
max_sequence_number.map(|v| v.get()).unwrap_or(0),
));
let (shutdown_tx, shutdown_rx) = oneshot::channel();

View File

@ -9,6 +9,7 @@ use crate::{
ingest_state::{IngestState, IngestStateError},
partition_iter::PartitionIter,
persist::{drain_buffer::persist_partitions, queue::PersistQueue},
query::projection::OwnedProjection,
};
/// Defines how often the shutdown task polls the partition buffers for
@ -77,10 +78,11 @@ pub(super) async fn graceful_shutdown_handler<F, T, P>(
// springs to life and buffers in the buffer tree after this check has
// completed - I think this is extreme enough to accept as a theoretical
// possibility that doesn't need covering off in practice.
while buffer
.partition_iter()
.any(|p| p.lock().get_query_data().is_some())
{
while buffer.partition_iter().any(|p| {
p.lock()
.get_query_data(&OwnedProjection::default())
.is_some()
}) {
if persist_partitions(buffer.partition_iter(), &persist).await != 0 {
// Late arriving writes needed persisting.
debug!("re-persisting late arriving data");

View File

@ -199,9 +199,7 @@ where
op,
} = op;
let sequence_number = SequenceNumber::new(
i64::try_from(sequence_number).expect("sequence number overflow"),
);
let sequence_number = SequenceNumber::new(sequence_number);
max_sequence = max_sequence.max(Some(sequence_number));

View File

@ -67,10 +67,7 @@ pub(super) async fn compact_persisting_batch(
adjust_sort_key_columns(&sk, &batch.schema().primary_key())
}
None => {
let sort_key = compute_sort_key(
batch.schema(),
batch.record_batches().iter().map(|sb| sb.as_ref()),
);
let sort_key = compute_sort_key(batch.schema(), batch.record_batches().iter());
// Use the sort key computed from the cardinality as the sort key for this parquet
// file's metadata, also return the sort key to be stored in the catalog
(sort_key.clone(), Some(sort_key))
@ -127,7 +124,7 @@ mod tests {
.to_arrow(Projection::All)
.unwrap();
let batch = QueryAdaptor::new(ARBITRARY_PARTITION_ID, vec![Arc::new(batch)]);
let batch = QueryAdaptor::new(ARBITRARY_PARTITION_ID, vec![batch]);
// verify PK
let schema = batch.schema();
@ -459,8 +456,7 @@ mod tests {
let expected_pk = vec!["tag1", "time"];
assert_eq!(expected_pk, pk);
let sort_key =
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
// compact
@ -500,8 +496,7 @@ mod tests {
let expected_pk = vec!["tag1", "time"];
assert_eq!(expected_pk, pk);
let sort_key =
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
// compact
@ -549,8 +544,7 @@ mod tests {
let expected_pk = vec!["tag1", "time"];
assert_eq!(expected_pk, pk);
let sort_key =
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
assert_eq!(sort_key, SortKey::from_columns(["tag1", "time"]));
// compact
@ -596,8 +590,7 @@ mod tests {
let expected_pk = vec!["tag1", "tag2", "time"];
assert_eq!(expected_pk, pk);
let sort_key =
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
// compact
@ -647,8 +640,7 @@ mod tests {
let expected_pk = vec!["tag1", "tag2", "time"];
assert_eq!(expected_pk, pk);
let sort_key =
compute_sort_key(schema, batch.record_batches().iter().map(|rb| rb.as_ref()));
let sort_key = compute_sort_key(schema, batch.record_batches().iter());
assert_eq!(sort_key, SortKey::from_columns(["tag1", "tag2", "time"]));
// compact
@ -699,7 +691,7 @@ mod tests {
batch.schema();
}
async fn create_one_row_record_batch_with_influxtype() -> Vec<Arc<RecordBatch>> {
async fn create_one_row_record_batch_with_influxtype() -> Vec<RecordBatch> {
let chunk1 = Arc::new(
TestChunk::new("t")
.with_id(1)
@ -723,11 +715,10 @@ mod tests {
];
assert_batches_eq!(&expected, &batches);
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
batches
}
async fn create_one_record_batch_with_influxtype_no_duplicates() -> Vec<Arc<RecordBatch>> {
async fn create_one_record_batch_with_influxtype_no_duplicates() -> Vec<RecordBatch> {
let chunk1 = Arc::new(
TestChunk::new("t")
.with_id(1)
@ -753,11 +744,10 @@ mod tests {
];
assert_batches_eq!(&expected, &batches);
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
batches
}
async fn create_one_record_batch_with_influxtype_duplicates() -> Vec<Arc<RecordBatch>> {
async fn create_one_record_batch_with_influxtype_duplicates() -> Vec<RecordBatch> {
let chunk1 = Arc::new(
TestChunk::new("t")
.with_id(1)
@ -790,12 +780,11 @@ mod tests {
];
assert_batches_eq!(&expected, &batches);
let batches: Vec<_> = batches.iter().map(|r| Arc::new(r.clone())).collect();
batches
}
/// RecordBatches with knowledge of influx metadata
async fn create_batches_with_influxtype() -> Vec<Arc<RecordBatch>> {
async fn create_batches_with_influxtype() -> Vec<RecordBatch> {
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
let mut batches = vec![];
@ -826,7 +815,7 @@ mod tests {
"+-----------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch1.clone()]);
batches.push(Arc::new(batch1));
batches.push(batch1);
// chunk2 having duplicate data with chunk 1
let chunk2 = Arc::new(
@ -850,7 +839,7 @@ mod tests {
"+-----------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch2.clone()]);
batches.push(Arc::new(batch2));
batches.push(batch2);
// verify data from both batches
let expected = vec![
@ -874,14 +863,13 @@ mod tests {
"| 5 | MT | 1970-01-01T00:00:00.000005Z |",
"+-----------+------+--------------------------------+",
];
let b: Vec<_> = batches.iter().map(|b| (**b).clone()).collect();
assert_batches_eq!(&expected, &b);
assert_batches_eq!(&expected, &batches);
batches
}
/// RecordBatches with knowledge of influx metadata
async fn create_batches_with_influxtype_different_columns() -> Vec<Arc<RecordBatch>> {
async fn create_batches_with_influxtype_different_columns() -> Vec<RecordBatch> {
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
let mut batches = vec![];
@ -912,7 +900,7 @@ mod tests {
"+-----------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch1.clone()]);
batches.push(Arc::new(batch1));
batches.push(batch1);
// chunk2 having duplicate data with chunk 1
// more columns
@ -939,14 +927,14 @@ mod tests {
"+-----------+------------+------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch2.clone()]);
batches.push(Arc::new(batch2));
batches.push(batch2);
batches
}
/// RecordBatches with knowledge of influx metadata
async fn create_batches_with_influxtype_different_columns_different_order(
) -> Vec<Arc<RecordBatch>> {
async fn create_batches_with_influxtype_different_columns_different_order() -> Vec<RecordBatch>
{
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
let mut batches = vec![];
@ -978,7 +966,7 @@ mod tests {
"+-----------+------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch1.clone()]);
batches.push(Arc::new(batch1.clone()));
batches.push(batch1.clone());
// chunk2 having duplicate data with chunk 1
// more columns
@ -1003,13 +991,13 @@ mod tests {
"+-----------+------+--------------------------------+",
];
assert_batches_eq!(&expected, &[batch2.clone()]);
batches.push(Arc::new(batch2));
batches.push(batch2);
batches
}
/// Has 2 tag columns; tag1 has a lower cardinality (3) than tag3 (4)
async fn create_batches_with_influxtype_different_cardinality() -> Vec<Arc<RecordBatch>> {
async fn create_batches_with_influxtype_different_cardinality() -> Vec<RecordBatch> {
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
let mut batches = vec![];
@ -1034,7 +1022,7 @@ mod tests {
"+-----------+------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &[batch1.clone()]);
batches.push(Arc::new(batch1.clone()));
batches.push(batch1.clone());
let chunk2 = Arc::new(
TestChunk::new("t")
@ -1057,13 +1045,13 @@ mod tests {
"+-----------+------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &[batch2.clone()]);
batches.push(Arc::new(batch2));
batches.push(batch2);
batches
}
/// RecordBatches with knowledge of influx metadata
async fn create_batches_with_influxtype_same_columns_different_type() -> Vec<Arc<RecordBatch>> {
async fn create_batches_with_influxtype_same_columns_different_type() -> Vec<RecordBatch> {
// Use the available TestChunk to create chunks and then convert them to raw RecordBatches
let mut batches = vec![];
@ -1087,7 +1075,7 @@ mod tests {
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &[batch1.clone()]);
batches.push(Arc::new(batch1));
batches.push(batch1);
// chunk2 having duplicate data with chunk 1
// more columns
@ -1110,7 +1098,7 @@ mod tests {
"+-----------+------+-----------------------------+",
];
assert_batches_eq!(&expected, &[batch2.clone()]);
batches.push(Arc::new(batch2));
batches.push(batch2);
batches
}

View File

@ -18,7 +18,7 @@ use crate::{
buffer_tree::{
namespace::NamespaceName,
partition::{persisting::PersistingData, PartitionData, SortKeyState},
table::TableName,
table::TableMetadata,
},
deferred_load::DeferredLoad,
persist::completion_observer::CompletedPersist,
@ -94,14 +94,14 @@ pub(super) struct Context {
// The partition key for this partition
partition_key: PartitionKey,
/// Deferred strings needed for persistence.
/// Deferred data needed for persistence.
///
/// These [`DeferredLoad`] are given a pre-fetch hint when this [`Context`]
/// is constructed to load them in the background (if not already resolved)
/// in order to avoid incurring the query latency when the values are
/// needed.
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_name: Arc<DeferredLoad<TableName>>,
table: Arc<DeferredLoad<TableMetadata>>,
/// The [`SortKey`] for the [`PartitionData`] at the time of [`Context`]
/// construction.
@ -164,7 +164,7 @@ impl Context {
partition_hash_id: guard.partition_hash_id().cloned(),
partition_key: guard.partition_key().clone(),
namespace_name: Arc::clone(guard.namespace_name()),
table_name: Arc::clone(guard.table_name()),
table: Arc::clone(guard.table()),
// Technically the sort key isn't immutable, but MUST NOT be
// changed by an external actor (by something other than code in
@ -182,7 +182,7 @@ impl Context {
// Pre-fetch the deferred values in a background thread (if not already
// resolved)
s.namespace_name.prefetch_now();
s.table_name.prefetch_now();
s.table.prefetch_now();
if let SortKeyState::Deferred(ref d) = s.sort_key {
d.prefetch_now();
}
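The prefetch pattern above is worth spelling out. A hedged sketch, assuming only the `DeferredLoad` API visible in this diff (`new`, `prefetch_now`, `get`); `fetch_table_metadata` is a hypothetical resolver:

let table: Arc<DeferredLoad<TableMetadata>> = Arc::new(DeferredLoad::new(
    Duration::from_secs(1),
    async { fetch_table_metadata().await }, // hypothetical resolver future
    &metric::Registry::default(),
));
table.prefetch_now();         // start resolving in the background now...
let meta = table.get().await; // ...so this await likely returns immediately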
@ -253,7 +253,7 @@ impl Context {
namespace_id = %self.namespace_id,
namespace_name = %self.namespace_name,
table_id = %self.table_id,
table_name = %self.table_name,
table = %self.table,
partition_id = %self.partition_id,
partition_key = %self.partition_key,
total_persist_duration = ?now.duration_since(self.enqueued_at),
@ -315,7 +315,7 @@ impl Context {
self.namespace_name.as_ref()
}
pub(super) fn table_name(&self) -> &DeferredLoad<TableName> {
self.table_name.as_ref()
pub(super) fn table(&self) -> &DeferredLoad<TableMetadata> {
self.table.as_ref()
}
}

View File

@ -501,7 +501,7 @@ mod tests {
test_util::{
make_write_op, PartitionDataBuilder, ARBITRARY_NAMESPACE_ID, ARBITRARY_NAMESPACE_NAME,
ARBITRARY_PARTITION_ID, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_ID,
ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_NAME_PROVIDER,
ARBITRARY_TABLE_NAME, ARBITRARY_TABLE_PROVIDER,
},
};
@ -510,7 +510,7 @@ mod tests {
async fn new_partition(sort_key: SortKeyState) -> Arc<Mutex<PartitionData>> {
let buffer_tree = BufferTree::new(
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME)),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
Arc::new(
MockPartitionProvider::default().with_partition(
PartitionDataBuilder::new()

View File

@ -110,6 +110,7 @@ mod tests {
use crate::{
persist::queue::mock::MockPersistQueue,
query::projection::OwnedProjection,
test_util::{PartitionDataBuilder, ARBITRARY_TABLE_NAME},
};
@ -162,7 +163,9 @@ mod tests {
guard
.buffer_write(mb, SequenceNumber::new(2))
.expect("write should succeed");
guard.get_query_data().expect("should have query adaptor")
guard
.get_query_data(&OwnedProjection::default())
.expect("should have query adaptor")
};
hot_partition_persister.observe(Arc::clone(&p), p.lock());
@ -170,7 +173,7 @@ mod tests {
tokio::task::yield_now().await;
// Assert the partition was queued for persistence with the correct data.
assert_matches!(persist_handle.calls().as_slice(), [got] => {
let got_query_data = got.lock().get_query_data().expect("should have query adaptor");
let got_query_data = got.lock().get_query_data(&OwnedProjection::default()).expect("should have query adaptor");
assert_eq!(got_query_data.record_batches(), want_query_data.record_batches());
});

View File

@ -48,7 +48,7 @@ mod tests {
test_util::{
make_write_op, populate_catalog, ARBITRARY_NAMESPACE_NAME,
ARBITRARY_NAMESPACE_NAME_PROVIDER, ARBITRARY_PARTITION_KEY, ARBITRARY_TABLE_NAME,
ARBITRARY_TABLE_NAME_PROVIDER,
ARBITRARY_TABLE_PROVIDER,
},
};
@ -67,7 +67,7 @@ mod tests {
// Init the buffer tree
let buf = BufferTree::new(
Arc::clone(&*ARBITRARY_NAMESPACE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_NAME_PROVIDER),
Arc::clone(&*ARBITRARY_TABLE_PROVIDER),
Arc::new(CatalogPartitionResolver::new(Arc::clone(&catalog))),
Arc::new(MockPostWriteObserver::default()),
Arc::new(metric::Registry::default()),

View File

@ -202,7 +202,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
?sort_key,
@ -218,7 +218,7 @@ where
compact_persisting_batch(
&worker_state.exec,
sort_key,
ctx.table_name().get().await,
ctx.table().get().await.name().clone(),
ctx.data().query_adaptor(),
)
.await
@ -249,7 +249,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
%object_store_id,
@ -265,7 +265,7 @@ where
namespace_id: ctx.namespace_id(),
namespace_name: Arc::clone(&*ctx.namespace_name().get().await),
table_id: ctx.table_id(),
table_name: Arc::clone(&*ctx.table_name().get().await),
table_name: Arc::clone(ctx.table().get().await.name()),
partition_key: ctx.partition_key().clone(),
compaction_level: CompactionLevel::Initial,
sort_key: Some(data_sort_key),
@ -291,7 +291,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
%object_store_id,
@ -358,7 +358,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
?new_sort_key,
@ -394,7 +394,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
expected=?old_sort_key,
@ -420,7 +420,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
expected=?old_sort_key,
@ -460,7 +460,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
?old_sort_key,
@ -488,7 +488,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
%object_store_id,
@ -512,7 +512,7 @@ where
namespace_id = %ctx.namespace_id(),
namespace_name = %ctx.namespace_name(),
table_id = %ctx.table_id(),
table_name = %ctx.table_name(),
table = %ctx.table(),
partition_id = %ctx.partition_id(),
partition_key = %ctx.partition_key(),
%object_store_id,

View File

@ -4,9 +4,10 @@ use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use iox_time::{SystemProvider, TimeProvider};
use metric::{DurationHistogram, Metric};
use predicate::Predicate;
use trace::span::Span;
use super::QueryExec;
use super::{projection::OwnedProjection, QueryExec};
use crate::query::QueryError;
/// An instrumentation decorator over a [`QueryExec`] implementation.
@ -62,14 +63,15 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
let t = self.time_provider.now();
let res = self
.inner
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, projection, span, predicate)
.await;
if let Some(delta) = self.time_provider.now().checked_duration_since(t) {
@ -113,7 +115,7 @@ mod tests {
// Call the decorator and assert the return value
let got = decorator
.query_exec(NamespaceId::new(42), TableId::new(24), vec![], None)
.query_exec(NamespaceId::new(42), TableId::new(24), OwnedProjection::default(), None, None)
.await;
assert_matches!(got, $($want_ret)+);

View File

@ -1,9 +1,10 @@
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use parking_lot::Mutex;
use predicate::Predicate;
use trace::span::Span;
use super::{response::QueryResponse, QueryError, QueryExec};
use super::{projection::OwnedProjection, response::QueryResponse, QueryError, QueryExec};
#[derive(Debug, Default)]
pub(crate) struct MockQueryExec {
@ -25,8 +26,9 @@ impl QueryExec for MockQueryExec {
&self,
_namespace_id: NamespaceId,
_table_id: TableId,
_columns: Vec<String>,
_projection: OwnedProjection,
_span: Option<Span>,
_predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
self.response
.lock()

View File

@ -3,6 +3,8 @@
mod r#trait;
pub(crate) use r#trait::*;
pub(crate) mod projection;
// Response types
pub(crate) mod partition_response;
pub(crate) mod response;

View File

@ -0,0 +1,129 @@
use arrow::record_batch::RecordBatch;
use mutable_batch::MutableBatch;
use schema::SchemaBuilder;
/// The private inner type, preventing callers from constructing an empty `Project` subset.
#[derive(Debug, Default)]
enum Projection {
/// Return all columns.
#[default]
All,
/// Return the specified subset of columns.
///
/// The returned columns MAY NOT match the specified column order.
//
// Invariant: subset is never empty - this variant is only constructed when
// there is at least one column to project.
Project(Vec<String>),
}
/// Specify the set of columns to project during a query.
///
/// Defaults to "all columns".
#[derive(Debug, Default)]
pub(crate) struct OwnedProjection(Projection);
impl From<Vec<String>> for OwnedProjection {
fn from(value: Vec<String>) -> Self {
if value.is_empty() {
return Self(Projection::All);
}
Self(Projection::Project(value))
}
}
impl OwnedProjection {
/// Copy the data within a [`MutableBatch`] into a [`RecordBatch`], applying
/// the specified projection.
///
/// This avoids copying column data for columns that are not part of the
/// projection.
///
/// NOTE: this copies the underlying column data
pub(crate) fn project_mutable_batches(&self, batch: &MutableBatch) -> RecordBatch {
// Pre-allocate the outputs to their maximal possible size to avoid
// reallocations.
let max_capacity = match &self.0 {
Projection::All => batch.columns().len(),
Projection::Project(s) => s.len(),
};
let mut schema_builder = SchemaBuilder::with_capacity(max_capacity);
let mut column_data = Vec::with_capacity(max_capacity);
// Compute the schema overlap between the requested projection, and the
// buffered data.
//
// Generate the RecordBatch contents in a single pass.
match &self.0 {
Projection::All => {
// If there's no projection, the columns must be emitted ordered
// by their name.
let mut columns = batch.columns().collect::<Vec<_>>();
columns.sort_unstable_by_key(|v| v.0);
for (name, column) in columns.into_iter() {
schema_builder.influx_column(name, column.influx_type());
column_data.push(column.to_arrow().expect("failed to snapshot buffer data"));
}
}
Projection::Project(cols) => {
// Invariant: subset is never empty
assert!(!cols.is_empty());
// Construct the schema & data arrays in a single pass, ordered
// by the projection and ignoring any missing columns.
for name in cols {
if let Ok(column) = batch.column(name) {
schema_builder.influx_column(name, column.influx_type());
column_data
.push(column.to_arrow().expect("failed to snapshot buffer data"));
}
}
}
};
let schema = schema_builder
.build()
.expect("failed to create batch schema");
RecordBatch::try_new(schema.into(), column_data)
.expect("failed to generate snapshot record batch")
}
/// Apply the specified projection to `batches`.
///
/// This projection requires relatively cheap ref-counting clones and does
/// not copy the underlying data.
pub(crate) fn project_record_batch(&self, batches: &[RecordBatch]) -> Vec<RecordBatch> {
match &self.0 {
Projection::All => batches.to_vec(),
Projection::Project(columns) => {
// Invariant: subset is never empty
assert!(!columns.is_empty());
batches
.iter()
.map(|batch| {
let schema = batch.schema();
// Map the column names to column indexes,
// ignoring specified columns that do not
// exist in this batch.
let projection = columns
.iter()
.flat_map(|column_name| schema.index_of(column_name).ok())
.collect::<Vec<_>>();
batch
.project(&projection)
.expect("batch projection failure")
})
.collect()
}
}
}
}
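A hedged, crate-internal usage sketch of the new type; `batches: Vec<RecordBatch>` is assumed to exist:

// An empty column list collapses to Projection::All, preserving the
// "subset is never empty" invariant documented above.
let all = OwnedProjection::from(Vec::<String>::new());
let some = OwnedProjection::from(vec![
    "level".to_string(),
    "platanos".to_string(), // nonexistent column: silently ignored
]);

// Cheap, ref-counted projection over already-snapshotted batches.
let projected: Vec<RecordBatch> = some.project_record_batch(&batches);
let everything: Vec<RecordBatch> = all.project_record_batch(&batches);

The empty-vec collapse means the wire convention used by the query RPC ("no columns requested" means "all columns") maps directly onto `Projection::All`, so the non-empty invariant of `Project` stays trivially enforceable.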

View File

@ -58,6 +58,7 @@ use iox_time::{SystemProvider, Time, TimeProvider};
use metric::{DurationHistogram, Metric, U64Histogram, U64HistogramOptions};
use observability_deps::tracing::debug;
use pin_project::{pin_project, pinned_drop};
use predicate::Predicate;
use trace::span::Span;
use crate::query::{
@ -66,6 +67,8 @@ use crate::query::{
QueryError, QueryExec,
};
use super::projection::OwnedProjection;
/// A [`QueryExec`] decorator adding instrumentation to the [`QueryResponse`]
/// returned by the inner implementation.
///
@ -202,14 +205,17 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
let started_at = self.time_provider.now();
// TODO(savage): Would accepting a predicate here require additional
// metrics to be added?
let stream = self
.inner
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, projection, span, predicate)
.await?;
let stream = QueryMetricContext::new(
@ -467,7 +473,13 @@ mod tests {
.with_time_provider(Arc::clone(&mock_time));
let response = layer
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("query should succeed");
@ -548,7 +560,13 @@ mod tests {
.with_time_provider(Arc::clone(&mock_time));
let response = layer
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("query should succeed");
@ -628,7 +646,13 @@ mod tests {
.with_time_provider(Arc::clone(&mock_time));
let response = layer
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("query should succeed");
@ -708,7 +732,13 @@ mod tests {
.with_time_provider(Arc::clone(&mock_time));
let response = layer
.query_exec(ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, vec![], None)
.query_exec(
ARBITRARY_NAMESPACE_ID,
ARBITRARY_TABLE_ID,
OwnedProjection::default(),
None,
None,
)
.await
.expect("query should succeed");

View File

@ -2,9 +2,10 @@ use std::borrow::Cow;
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use predicate::Predicate;
use trace::span::{Span, SpanRecorder};
use super::QueryExec;
use super::{projection::OwnedProjection, QueryExec};
use crate::query::QueryError;
/// A tracing decorator over a [`QueryExec`] implementation.
@ -40,14 +41,21 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
let mut recorder = SpanRecorder::new(span).child(self.name.clone());
match self
.inner
.query_exec(namespace_id, table_id, columns, recorder.span().cloned())
.query_exec(
namespace_id,
table_id,
projection,
recorder.span().cloned(),
predicate,
)
.await
{
Ok(v) => {
@ -109,8 +117,9 @@ mod tests {
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
OwnedProjection::default(),
Some(span.child("root span")),
None,
)
.await
.expect("wrapper should not modify result");
@ -132,8 +141,9 @@ mod tests {
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
OwnedProjection::default(),
Some(span.child("root span")),
None,
)
.await
.expect_err("wrapper should not modify result");

View File

@ -2,9 +2,12 @@ use std::{fmt::Debug, ops::Deref, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use predicate::Predicate;
use thiserror::Error;
use trace::span::Span;
use super::projection::OwnedProjection;
#[derive(Debug, Error)]
#[allow(missing_copy_implementations)]
pub(crate) enum QueryError {
@ -23,8 +26,9 @@ pub(crate) trait QueryExec: Send + Sync + Debug {
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError>;
}
@ -39,11 +43,12 @@ where
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
projection: OwnedProjection,
span: Option<Span>,
predicate: Option<Predicate>,
) -> Result<Self::Response, QueryError> {
self.deref()
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, projection, span, predicate)
.await
}
}
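The blanket impl above lets callers invoke `query_exec` straight through smart pointers. A hedged call-site sketch (assuming the blanket impl covers `Arc<dyn QueryExec>`, as its `Deref` bound suggests):

async fn run(
    exec: Arc<dyn QueryExec<Response = QueryResponse>>,
) -> Result<QueryResponse, QueryError> {
    exec.query_exec(
        NamespaceId::new(42),
        TableId::new(24),
        OwnedProjection::default(), // all columns
        None,                       // no tracing span
        None,                       // no predicate
    )
    .await
}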

View File

@ -5,15 +5,13 @@ use std::{any::Any, sync::Arc};
use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{ChunkId, ChunkOrder, PartitionId};
use datafusion::{error::DataFusionError, physical_plan::Statistics};
use data_types::{ChunkId, ChunkOrder, PartitionId, TimestampMinMax};
use datafusion::physical_plan::Statistics;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::{compute_timenanosecond_min_max, create_basic_summary},
QueryChunk, QueryChunkData,
};
use once_cell::sync::OnceCell;
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, sort::SortKey, Projection, Schema};
/// A queryable wrapper over a set of ordered [`RecordBatch`] snapshot from a
@ -30,7 +28,7 @@ pub struct QueryAdaptor {
///
/// This MUST be non-pub(crate) / closed for modification / immutable to support
/// interning the merged schema in [`Self::schema()`].
data: Vec<Arc<RecordBatch>>,
data: Vec<RecordBatch>,
/// The catalog ID of the partition this data is part of.
partition_id: PartitionId,
@ -52,12 +50,12 @@ impl QueryAdaptor {
///
/// This constructor panics if `data` contains no [`RecordBatch`], or if
/// all [`RecordBatch`] are empty.
pub(crate) fn new(partition_id: PartitionId, data: Vec<Arc<RecordBatch>>) -> Self {
pub(crate) fn new(partition_id: PartitionId, data: Vec<RecordBatch>) -> Self {
// There must always be at least one record batch and one row.
//
// This upholds an invariant that simplifies dealing with empty
// partitions - if there is a QueryAdaptor, it contains data.
assert!(data.iter().map(|b| b.num_rows()).sum::<usize>() > 0);
assert!(data.iter().any(|b| b.num_rows() > 0));
let schema = merge_record_batch_schemas(&data);
Self {
@ -75,8 +73,7 @@ impl QueryAdaptor {
// Project the column selection across all RecordBatch
self.data
.iter()
.map(|data| {
let batch = data.as_ref();
.map(|batch| {
let schema = batch.schema();
// Apply selection to in-memory batch
@ -98,25 +95,40 @@ impl QueryAdaptor {
}
/// Returns the [`RecordBatch`] instances in this [`QueryAdaptor`].
pub(crate) fn record_batches(&self) -> &[Arc<RecordBatch>] {
pub(crate) fn record_batches(&self) -> &[RecordBatch] {
self.data.as_ref()
}
/// Unwrap this [`QueryAdaptor`], yielding the inner [`RecordBatch`]
/// instances.
pub(crate) fn into_record_batches(self) -> Vec<RecordBatch> {
self.data
}
/// Returns the partition ID from which the data in this [`QueryAdaptor`]
/// was sourced.
pub(crate) fn partition_id(&self) -> PartitionId {
self.partition_id
}
/// Number of rows, useful for building stats
pub(crate) fn num_rows(&self) -> u64 {
self.data.iter().map(|b| b.num_rows()).sum::<usize>() as u64
}
/// Time range, useful for building stats
pub(crate) fn ts_min_max(&self) -> TimestampMinMax {
compute_timenanosecond_min_max(self.data.iter()).expect("Should have time range")
}
}
impl QueryChunk for QueryAdaptor {
fn stats(&self) -> Arc<Statistics> {
Arc::clone(self.stats.get_or_init(|| {
let ts_min_max = compute_timenanosecond_min_max(self.data.iter().map(|b| b.as_ref()))
.expect("Should have time range");
let ts_min_max = self.ts_min_max();
Arc::new(create_basic_summary(
self.data.iter().map(|b| b.num_rows()).sum::<usize>() as u64,
self.num_rows(),
self.schema(),
ts_min_max,
))
@ -147,20 +159,6 @@ impl QueryChunk for QueryAdaptor {
true
}
/// Return a set of Strings containing the distinct values in the
/// specified columns. If the predicate can be evaluated entirely
/// on the metadata of this Chunk. Returns `None` otherwise
///
/// The requested columns must all have String type.
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
fn data(&self) -> QueryChunkData {
let schema = self.schema().as_arrow();

View File

@ -12,6 +12,7 @@ use futures::{Stream, StreamExt, TryStreamExt};
use ingester_query_grpc::influxdata::iox::ingester::v1 as proto;
use metric::{DurationHistogram, U64Counter};
use observability_deps::tracing::*;
use predicate::Predicate;
use prost::Message;
use thiserror::Error;
use tokio::sync::{Semaphore, TryAcquireError};
@ -26,7 +27,7 @@ use instrumentation::FlightFrameEncodeInstrumentation;
use crate::{
ingester_id::IngesterId,
query::{response::QueryResponse, QueryError, QueryExec},
query::{projection::OwnedProjection, response::QueryResponse, QueryError, QueryExec},
};
/// Error states for the query RPC handler.
@ -48,6 +49,10 @@ enum Error {
/// The number of simultaneous queries being executed has been reached.
#[error("simultaneous query limit exceeded")]
RequestLimit,
/// The payload within the request has an invalid field value.
#[error("field violation: {0}")]
FieldViolation(#[from] ingester_query_grpc::FieldViolation),
}
/// Map a query-execution error into a [`tonic::Status`].
@ -77,6 +82,10 @@ impl From<Error> for tonic::Status {
warn!("simultaneous query limit exceeded");
Code::ResourceExhausted
}
Error::FieldViolation(_) => {
debug!(error=%e, "request contains field violation");
Code::InvalidArgument
}
};
Self::new(code, e.to_string())
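A standalone sketch of the resulting gRPC status for the new variant, using tonic's public API:

use tonic::{Code, Status};

fn main() {
    // FieldViolation maps to InvalidArgument, per the match arm above.
    let status = Status::new(Code::InvalidArgument, "field violation: predicate");
    assert_eq!(status.code(), Code::InvalidArgument);
    println!("{}", status.message());
}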
@ -188,18 +197,21 @@ where
let ticket = request.into_inner();
let request = proto::IngesterQueryRequest::decode(&*ticket.ticket).map_err(Error::from)?;
// Extract the namespace/table identifiers
// Extract the namespace/table identifiers and the query predicate
let namespace_id = NamespaceId::new(request.namespace_id);
let table_id = TableId::new(request.table_id);
let predicate = if let Some(p) = request.predicate {
debug!(predicate=?p, "received query predicate");
Some(Predicate::try_from(p).map_err(Error::from)?)
} else {
None
};
// Predicate pushdown is part of the API, but not implemented.
if let Some(p) = request.predicate {
debug!(predicate=?p, "ignoring query predicate (unsupported)");
}
let projection = OwnedProjection::from(request.columns);
let response = match self
.query_handler
.query_exec(namespace_id, table_id, request.columns, span.clone())
.query_exec(namespace_id, table_id, projection, span.clone(), predicate)
.await
{
Ok(v) => v,
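For reference, a hedged sketch of the request shape this handler now fully consumes (field names as used by the integration tests later in this diff; the encoded predicate payload is elided):

let request = proto::IngesterQueryRequest {
    namespace_id: 42,
    table_id: 24,
    columns: vec!["time".to_string(), "level".to_string()], // empty => all columns
    predicate: None, // Some(p) is now decoded into a Predicate, not ignored
};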

View File

@ -1,6 +1,9 @@
use std::{collections::BTreeMap, sync::Arc, time::Duration};
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, TableId};
use data_types::{
partition_template::TablePartitionTemplateOverride, NamespaceId, PartitionId, PartitionKey,
SequenceNumber, TableId,
};
use iox_catalog::{interface::Catalog, test_helpers::arbitrary_namespace};
use lazy_static::lazy_static;
use mutable_batch_lp::lines_to_batches;
@ -15,8 +18,8 @@ use crate::{
},
partition::{PartitionData, SortKeyState},
table::{
name_resolver::{mock::MockTableNameProvider, TableNameProvider},
TableName,
metadata_resolver::{mock::MockTableProvider, TableProvider},
TableMetadata, TableName,
},
},
deferred_load::DeferredLoad,
@ -44,10 +47,15 @@ pub(crate) fn defer_namespace_name_1_ms() -> Arc<DeferredLoad<NamespaceName>> {
))
}
pub(crate) fn defer_table_name_1_sec() -> Arc<DeferredLoad<TableName>> {
pub(crate) fn defer_table_metadata_1_sec() -> Arc<DeferredLoad<TableMetadata>> {
Arc::new(DeferredLoad::new(
Duration::from_secs(1),
async { ARBITRARY_TABLE_NAME.clone() },
async {
TableMetadata::new_for_testing(
ARBITRARY_TABLE_NAME.clone(),
TablePartitionTemplateOverride::default(),
)
},
&metric::Registry::default(),
))
}
@ -60,8 +68,11 @@ lazy_static! {
pub(crate) static ref ARBITRARY_NAMESPACE_NAME_PROVIDER: Arc<dyn NamespaceNameProvider> =
Arc::new(MockNamespaceNameProvider::new(&**ARBITRARY_NAMESPACE_NAME));
pub(crate) static ref ARBITRARY_TABLE_NAME: TableName = TableName::from("bananas");
pub(crate) static ref ARBITRARY_TABLE_NAME_PROVIDER: Arc<dyn TableNameProvider> =
Arc::new(MockTableNameProvider::new(&**ARBITRARY_TABLE_NAME));
pub(crate) static ref ARBITRARY_TABLE_PROVIDER: Arc<dyn TableProvider> =
Arc::new(MockTableProvider::new(TableMetadata::new_for_testing(
ARBITRARY_TABLE_NAME.clone(),
TablePartitionTemplateOverride::default()
)));
}
/// Build a [`PartitionData`] with mostly arbitrary-yet-valid values for tests.
@ -71,7 +82,7 @@ pub(crate) struct PartitionDataBuilder {
partition_key: Option<PartitionKey>,
namespace_id: Option<NamespaceId>,
table_id: Option<TableId>,
table_name_loader: Option<Arc<DeferredLoad<TableName>>>,
table_loader: Option<Arc<DeferredLoad<TableMetadata>>>,
namespace_loader: Option<Arc<DeferredLoad<NamespaceName>>>,
sort_key: Option<SortKeyState>,
}
@ -101,11 +112,11 @@ impl PartitionDataBuilder {
self
}
pub(crate) fn with_table_name_loader(
pub(crate) fn with_table_loader(
mut self,
table_name_loader: Arc<DeferredLoad<TableName>>,
table_loader: Arc<DeferredLoad<TableMetadata>>,
) -> Self {
self.table_name_loader = Some(table_name_loader);
self.table_loader = Some(table_loader);
self
}
@ -134,8 +145,7 @@ impl PartitionDataBuilder {
self.namespace_loader
.unwrap_or_else(defer_namespace_name_1_sec),
self.table_id.unwrap_or(ARBITRARY_TABLE_ID),
self.table_name_loader
.unwrap_or_else(defer_table_name_1_sec),
self.table_loader.unwrap_or_else(defer_table_metadata_1_sec),
self.sort_key.unwrap_or(SortKeyState::Provided(None)),
)
}
@ -270,7 +280,7 @@ pub(crate) fn make_write_op(
namespace_id: NamespaceId,
table_name: &str,
table_id: TableId,
sequence_number: i64,
sequence_number: u64,
lines: &str,
span_ctx: Option<SpanContext>,
) -> WriteOperation {

View File

@ -32,7 +32,7 @@ impl TimestampOracle {
// or diverge between threads.
let v = self.0.fetch_add(1, Ordering::Relaxed);
SequenceNumber::new(v as i64)
SequenceNumber::new(v)
}
}
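A standalone sketch of the oracle pattern above: a relaxed `fetch_add` is enough for per-process monotonicity, and storing `u64` removes the `i64` conversion the old code needed:

use std::sync::atomic::{AtomicU64, Ordering};

struct Oracle(AtomicU64);

impl Oracle {
    fn next(&self) -> u64 {
        self.0.fetch_add(1, Ordering::Relaxed)
    }
}

fn main() {
    let oracle = Oracle(AtomicU64::new(42));
    assert_eq!(oracle.next(), 42);
    assert_eq!(oracle.next(), 43);
}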
@ -106,6 +106,6 @@ mod tests {
timestamps
.into_iter()
.zip(expected)
.for_each(|(got, want)| assert_eq!(got, want as i64));
.for_each(|(got, want)| assert_eq!(got, want as u64));
}
}

View File

@ -248,7 +248,7 @@ mod tests {
/// Return a [`SequenceNumberSet`] containing `vals`.
fn new_set<T>(vals: T) -> SequenceNumberSet
where
T: IntoIterator<Item = i64>,
T: IntoIterator<Item = u64>,
{
vals.into_iter().map(SequenceNumber::new).collect()
}
@ -257,7 +257,7 @@ mod tests {
/// [`SequenceNumberSet`] values.
fn new_note<T>(vals: T) -> Arc<CompletedPersist>
where
T: IntoIterator<Item = i64>,
T: IntoIterator<Item = u64>,
{
Arc::new(CompletedPersist::new(
ParquetFileParams {

View File

@ -105,10 +105,7 @@ impl WalAppender for Arc<wal::Wal> {
let partition_sequence_numbers = w
.tables()
.map(|(table_id, data)| {
(
*table_id,
data.partitioned_data().sequence_number().get() as u64,
)
(*table_id, data.partitioned_data().sequence_number().get())
})
.collect::<HashMap<TableId, u64>>();
(

ingester/tests/query.rs (new file)
View File

@ -0,0 +1,162 @@
use arrow_util::assert_batches_sorted_eq;
use data_types::PartitionKey;
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
use ingester_test_ctx::TestContextBuilder;
use metric::{DurationHistogram, U64Histogram};
// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;
// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;
// A subsequent write with a non-contiguous sequence number to a different table.
ctx.write_lp(
namespace_name,
"cpu bar=2 20\ncpu bar=3 30",
partition_key.clone(),
7,
)
.await;
// And a third write that appends more data to the table in the initial
// write.
ctx.write_lp(
namespace_name,
"bananas count=42 200",
partition_key.clone(),
42,
)
.await;
// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");
let expected = vec![
"+-------+-----------+--------------------------------+",
"| count | greatness | time |",
"+-------+-----------+--------------------------------+",
"| | unbounded | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
// Assert various ingest metrics.
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_dml_sink_apply_duration",
&[("handler", "write_apply"), ("result", "success")],
)
.fetch();
assert_eq!(hist.sample_count(), 3);
// Read metrics
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_query_stream_duration",
&[("request", "complete")],
)
.fetch();
assert_eq!(hist.sample_count(), 1);
let hist = ctx
.get_metric::<U64Histogram, _>("ingester_query_result_row", &[])
.fetch();
assert_eq!(hist.sample_count(), 1);
assert_eq!(hist.total, 2);
}
// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query_projection() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;
// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\",level=42 10",
partition_key.clone(),
0,
)
.await;
// Another write that appends more data to the table in the initial write.
ctx.write_lp(
namespace_name,
"bananas count=42,level=4242 200",
partition_key.clone(),
42,
)
.await;
// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");
let expected = vec![
"+-------+-----------+--------+--------------------------------+",
"| count | greatness | level | time |",
"+-------+-----------+--------+--------------------------------+",
"| | unbounded | 42.0 | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 4242.0 | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
// And perform a query with projection, selecting a column that is entirely
// non-NULL, a column containing NULLs (in a different order to the above)
// and a column that does not exist.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![
"level".to_string(),
"greatness".to_string(),
"platanos".to_string(),
],
predicate: None,
})
.await
.expect("query request failed");
let expected = vec![
"+--------+-----------+",
"| level | greatness |",
"+--------+-----------+",
"| 42.0 | unbounded |",
"| 4242.0 | |",
"+--------+-----------+",
];
assert_batches_sorted_eq!(&expected, &data);
}

View File

@ -10,88 +10,6 @@ use metric::{
use parquet_file::ParquetFilePath;
use std::{sync::Arc, time::Duration};
// Write data to an ingester through the RPC interface and query the data, validating the contents.
#[tokio::test]
async fn write_query() {
let namespace_name = "write_query_test_namespace";
let mut ctx = TestContextBuilder::default().build().await;
let ns = ctx.ensure_namespace(namespace_name, None).await;
// Initial write
let partition_key = PartitionKey::from("1970-01-01");
ctx.write_lp(
namespace_name,
"bananas greatness=\"unbounded\" 10",
partition_key.clone(),
0,
)
.await;
// A subsequent write with a non-contiguous sequence number to a different table.
ctx.write_lp(
namespace_name,
"cpu bar=2 20\ncpu bar=3 30",
partition_key.clone(),
7,
)
.await;
// And a third write that appends more data to the table in the initial
// write.
ctx.write_lp(
namespace_name,
"bananas count=42 200",
partition_key.clone(),
42,
)
.await;
// Perform a query to validate the actual data buffered.
let data: Vec<_> = ctx
.query(IngesterQueryRequest {
namespace_id: ns.id.get(),
table_id: ctx.table_id(namespace_name, "bananas").await.get(),
columns: vec![],
predicate: None,
})
.await
.expect("query request failed");
let expected = vec![
"+-------+-----------+--------------------------------+",
"| count | greatness | time |",
"+-------+-----------+--------------------------------+",
"| | unbounded | 1970-01-01T00:00:00.000000010Z |",
"| 42.0 | | 1970-01-01T00:00:00.000000200Z |",
"+-------+-----------+--------------------------------+",
];
assert_batches_sorted_eq!(&expected, &data);
// Assert various ingest metrics.
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_dml_sink_apply_duration",
&[("handler", "write_apply"), ("result", "success")],
)
.fetch();
assert_eq!(hist.sample_count(), 3);
// Read metrics
let hist = ctx
.get_metric::<DurationHistogram, _>(
"ingester_query_stream_duration",
&[("request", "complete")],
)
.fetch();
assert_eq!(hist.sample_count(), 1);
let hist = ctx
.get_metric::<U64Histogram, _>("ingester_query_result_row", &[])
.fetch();
assert_eq!(hist.sample_count(), 1);
assert_eq!(hist.total, 2);
}
// Write data to an ingester through the RPC interface and persist the data.
#[tokio::test]
async fn write_persist() {

View File

@ -242,7 +242,7 @@ where
namespace: &str,
lp: &str,
partition_key: PartitionKey,
sequence_number: i64,
sequence_number: u64,
) {
// Resolve the namespace ID needed to construct the DML op
let namespace_id = self.namespace_id(namespace).await;

View File

@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies] # In alphabetical order
async-trait = "0.1.68"
async-trait = "0.1.70"
data_types = { path = "../data_types" }
futures = "0.3"
iox_time = { version = "0.1.0", path = "../iox_time" }
@ -20,7 +20,7 @@ siphasher = "0.3"
snafu = "0.7"
sqlx = { version = "0.6", features = [ "runtime-tokio-rustls" , "postgres", "uuid", "sqlite" ] }
sqlx-hotswap-pool = { path = "../sqlx-hotswap-pool" }
thiserror = "1.0.40"
thiserror = "1.0.41"
tokio = { version = "1.29", features = ["io-util", "macros", "parking_lot", "rt-multi-thread", "time"] }
uuid = { version = "1", features = ["v4"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
@ -30,7 +30,7 @@ assert_matches = "1.5.0"
dotenvy = "0.15.7"
generated_types = { path = "../generated_types" }
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.12"
paste = "1.0.13"
pretty_assertions = "1.3.0"
rand = "0.8"
tempfile = "3"

View File

@ -179,8 +179,8 @@ decorate!(
"partition_list_skipped_compactions" = list_skipped_compactions(&mut self) -> Result<Vec<SkippedCompaction>>;
"partition_delete_skipped_compactions" = delete_skipped_compactions(&mut self, partition_id: PartitionId) -> Result<Option<SkippedCompaction>>;
"partition_most_recent_n" = most_recent_n(&mut self, n: usize) -> Result<Vec<Partition>>;
"partitions_new_file_between" = partitions_new_file_between(&mut self, minimum_time: Timestamp, maximum_time: Option<Timestamp>) -> Result<Vec<PartitionId>>;
"get_in_skipped_compaction" = get_in_skipped_compaction(&mut self, partition_id: PartitionId) -> Result<Option<SkippedCompaction>>;
"partition_partitions_new_file_between" = partitions_new_file_between(&mut self, minimum_time: Timestamp, maximum_time: Option<Timestamp>) -> Result<Vec<PartitionId>>;
"partition_get_in_skipped_compaction" = get_in_skipped_compaction(&mut self, partition_id: PartitionId) -> Result<Option<SkippedCompaction>>;
]
);
@ -195,7 +195,7 @@ decorate!(
"parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result<Vec<ParquetFileId>>;
"parquet_list_by_partition_not_to_delete" = list_by_partition_not_to_delete(&mut self, partition_id: PartitionId) -> Result<Vec<ParquetFile>>;
"parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: Uuid) -> Result<Option<ParquetFile>>;
"exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec<Uuid>) -> Result<Vec<Uuid>>;
"parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec<Uuid>) -> Result<Vec<Uuid>>;
"parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, delete: &[ParquetFileId], upgrade: &[ParquetFileId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result<Vec<ParquetFileId>>;
]
);
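
The renames above make every decorated catalog method report under a metric name prefixed with its object type (`partition_…`, `parquet_…`). A simplified sketch of what the `decorate!` macro's generated wrappers do; this is illustrative only, not the macro's actual expansion:

```rust
// Illustrative only: time a catalog call and record the duration under the
// (now consistently prefixed) metric name. The real code records into a
// DurationHistogram rather than printing.
use std::time::Instant;

fn record<T>(metric_name: &str, f: impl FnOnce() -> T) -> T {
    let start = Instant::now();
    let result = f();
    println!("{metric_name}: {:?}", start.elapsed());
    result
}

fn main() {
    let _rows = record("partition_get_in_skipped_compaction", || {
        // ... the decorated catalog query would run here ...
        Vec::<u64>::new()
    });
}
```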

View File

@ -23,7 +23,7 @@ rand = { version = "0.8.3", features = ["small_rng"] }
regex = "1.8"
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.99"
serde_json = "1.0.100"
snafu = "0.7"
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
toml = "0.7.5"

View File

@ -22,7 +22,7 @@ use arrow::{
use async_trait::async_trait;
use data_types::{ChunkId, ChunkOrder, PartitionId};
use datafusion::{error::DataFusionError, physical_plan::Statistics, prelude::SessionContext};
use exec::{stringset::StringSet, IOxSessionContext};
use exec::IOxSessionContext;
use hashbrown::HashMap;
use observability_deps::tracing::trace;
use once_cell::sync::Lazy;
@ -34,6 +34,7 @@ use schema::{
};
use std::{any::Any, fmt::Debug, sync::Arc};
pub mod chunk_statistics;
pub mod config;
pub mod exec;
pub mod frontend;
@ -81,18 +82,6 @@ pub trait QueryChunk: Debug + Send + Sync + 'static {
/// key" within itself
fn may_contain_pk_duplicates(&self) -> bool;
/// Return a set of Strings containing the distinct values in the
/// specified columns, if the predicate can be evaluated entirely
/// on the metadata of this Chunk. Returns `None` otherwise.
///
/// The requested columns must all have String type.
fn column_values(
&self,
ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError>;
/// Provides access to raw [`QueryChunk`] data.
///
/// The engine assumes that minimal work will be performed to gather the `QueryChunkData`.
@ -271,15 +260,6 @@ where
self.as_ref().may_contain_pk_duplicates()
}
fn column_values(
&self,
ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
self.as_ref().column_values(ctx, column_name, predicate)
}
fn data(&self) -> QueryChunkData {
self.as_ref().data()
}
@ -323,15 +303,6 @@ impl QueryChunk for Arc<dyn QueryChunk> {
self.as_ref().may_contain_pk_duplicates()
}
fn column_values(
&self,
ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
self.as_ref().column_values(ctx, column_name, predicate)
}
fn data(&self) -> QueryChunkData {
self.as_ref().data()
}
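
The surviving impls above forward every `QueryChunk` method through `as_ref()`, so wrappers keep working after `column_values` is removed. A minimal, self-contained sketch of that delegation pattern (simplified trait, not the real `QueryChunk`):

```rust
// Sketch: implementing a trait for `Arc<dyn Trait>` by forwarding through
// `as_ref()`, so trait objects behind an Arc behave like the inner chunk.
use std::sync::Arc;

trait Chunk {
    fn may_contain_pk_duplicates(&self) -> bool;
}

impl Chunk for Arc<dyn Chunk> {
    fn may_contain_pk_duplicates(&self) -> bool {
        // Dynamic dispatch lands on the concrete chunk's implementation.
        self.as_ref().may_contain_pk_duplicates()
    }
}

struct TestChunk;

impl Chunk for TestChunk {
    fn may_contain_pk_duplicates(&self) -> bool {
        false
    }
}

fn main() {
    let chunk: Arc<dyn Chunk> = Arc::new(TestChunk);
    assert!(!chunk.may_contain_pk_duplicates());
}
```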

View File

@ -1120,18 +1120,6 @@ impl QueryChunk for TestChunk {
"Test Chunk"
}
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
self.check_error()?;
// Model not being able to get column values from metadata
Ok(None)
}
fn order(&self) -> ChunkOrder {
self.order
}

View File

@ -20,7 +20,7 @@ predicate = { path = "../predicate" }
query_functions = { path = "../query_functions" }
regex = "1"
schema = { path = "../schema" }
serde_json = "1.0.99"
serde_json = "1.0.100"
thiserror = "1.0"
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -2028,7 +2028,7 @@ mod test {
use crate::plan::ir::TagSet;
use datafusion::common::Result;
use influxdb_influxql_parser::select::SelectStatement;
use schema::{InfluxColumnType, InfluxFieldType};
use schema::{InfluxColumnType, InfluxFieldType, SchemaBuilder};
/// Test implementation that converts `Select` to `SelectStatement` so that it can be
/// converted back to a string.
@ -2647,7 +2647,18 @@ mod test {
/// Projections which contain function calls
#[test]
fn projection_call_expr() {
let namespace = MockSchemaProvider::default();
let mut namespace = MockSchemaProvider::default();
// Add a schema with tags that could conflict with aliasing against an
// existing call expression, in this case "last"
namespace.add_schema(
SchemaBuilder::new()
.measurement("conflicts")
.timestamp()
.tag("last")
.influx_field("field_f64", InfluxFieldType::Float)
.build()
.unwrap(),
);
let stmt = parse_select("SELECT COUNT(field_i64) FROM temp_01");
let stmt = rewrite_select_statement(&namespace, &stmt).unwrap();
@ -2694,6 +2705,14 @@ mod test {
stmt.to_string(),
"SELECT time::timestamp AS time, sum(field_f64::float) AS sum_field_f64, sum(field_i64::integer) AS sum_field_i64, sum(field_u64::unsigned) AS sum_field_u64, sum(shared_field0::float) AS sum_shared_field0 FROM temp_01"
);
// Handles conflicts when call expression is renamed to match an existing tag
let stmt = parse_select("SELECT LAST(field_f64), last FROM conflicts");
let stmt = rewrite_select_statement(&namespace, &stmt).unwrap();
assert_eq!(
stmt.to_string(),
"SELECT time::timestamp AS time, last(field_f64::float) AS last, last::tag AS last_1 FROM conflicts"
);
}
}
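
The new test asserts the disambiguation rule for alias collisions: when a projected column's name is already taken (here the tag `last` colliding with the alias of the `last(...)` call), the rewriter appends a numeric suffix, yielding `last_1`. A hedged sketch of that rule in isolation, not the planner's actual code:

```rust
// Sketch: first claimant keeps the name; later collisions get `_1`, `_2`, ...
use std::collections::HashSet;

fn unique_alias(wanted: &str, taken: &mut HashSet<String>) -> String {
    if taken.insert(wanted.to_string()) {
        return wanted.to_string();
    }
    let mut n = 1;
    loop {
        let candidate = format!("{wanted}_{n}");
        if taken.insert(candidate.clone()) {
            return candidate;
        }
        n += 1;
    }
}

fn main() {
    let mut taken = HashSet::new();
    assert_eq!(unique_alias("last", &mut taken), "last");
    assert_eq!(unique_alias("last", &mut taken), "last_1");
}
```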

View File

@ -66,9 +66,6 @@ const CONCURRENT_TABLE_JOBS: usize = 10;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("gRPC planner got error finding column values: {}", source))]
FindingColumnValues { source: DataFusionError },
#[snafu(display(
"gRPC planner got error fetching chunks for table '{}': {}",
table_name,
@ -180,7 +177,6 @@ impl Error {
| Self::BuildingPlan { source, .. }
| Self::ReadColumns { source, .. }
| Self::CheckingChunkPredicate { source, .. }
| Self::FindingColumnValues { source, .. }
| Self::CastingAggregates { source, .. } => {
DataFusionError::Context(format!("{method}: {msg}"), Box::new(source))
}
@ -480,7 +476,6 @@ impl InfluxRpcPlanner {
)
.and_then(|(table_name, table_schema, predicate, chunks)| async move {
let mut chunks_full = vec![];
let mut known_values = BTreeSet::new();
let chunks = prune_chunks(&table_schema, chunks, &predicate);
for chunk in cheap_chunk_first(chunks) {
@ -513,36 +508,15 @@ impl InfluxRpcPlanner {
}
);
// try and get the list of values directly from metadata
let mut ctx = self.ctx.child_ctx("tag_values execution");
ctx.set_metadata("table", table_name.to_string());
let maybe_values = chunk
.column_values(ctx, tag_name, &predicate)
.context(FindingColumnValuesSnafu)?;
match maybe_values {
Some(mut names) => {
debug!(
%table_name,
names=?names,
chunk_id=%chunk.id().get(),
"tag values found from metadata",
);
known_values.append(&mut names);
}
None => {
debug!(
%table_name,
chunk_id=%chunk.id().get(),
"need full plan to find tag values"
);
chunks_full.push(chunk);
}
}
debug!(
%table_name,
chunk_id=%chunk.id().get(),
"need full plan to find tag values"
);
chunks_full.push(chunk);
}
Ok((table_name, predicate, chunks_full, known_values))
Ok((table_name, predicate, chunks_full))
})
.try_collect()
.await?;
@ -554,9 +528,7 @@ impl InfluxRpcPlanner {
// At this point, we have a set of tag_values we know at plan
// time in `known_columns`, and some tables in chunks that we
// need to run a plan to find what values pass the predicate.
for (table_name, predicate, chunks_full, known_values) in tables {
builder = builder.append_other(known_values.into());
for (table_name, predicate, chunks_full) in tables {
if !chunks_full.is_empty() {
let schema = namespace
.table_schema(table_name)
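
After this change the planner no longer tries to answer tag-value queries from chunk metadata (the removed `column_values` path); every chunk that survives pruning goes straight onto the list needing a full scan plan. A simplified sketch of the resulting control flow, under those assumptions:

```rust
// Sketch: the metadata shortcut is gone, so all chunks take the full-plan path.
struct Chunk {
    id: u64,
}

fn plan_tag_values(chunks: Vec<Chunk>) -> Vec<Chunk> {
    let mut chunks_full = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        // Previously: try `column_values` on chunk metadata first, falling
        // back to a full plan only when it returned `None`.
        chunks_full.push(chunk);
    }
    chunks_full
}

fn main() {
    let full = plan_tag_values(vec![Chunk { id: 1 }, Chunk { id: 2 }]);
    assert_eq!(full.iter().map(|c| c.id).collect::<Vec<_>>(), vec![1, 2]);
}
```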

View File

@ -12,14 +12,14 @@ license.workspace = true
authz = { path = "../authz", features = ["http"] }
clap_blocks = { path = "../clap_blocks" }
generated_types = { path = "../generated_types" }
heappy = { git = "https://github.com/mkmik/heappy", rev = "1d6ac77a4026fffce8680a7b31a9f6e9859b5e73", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true }
heappy = { git = "https://github.com/mkmik/heappy", rev = "1de977a241cdd768acc5b6c82c0728b30c7db7b4", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true }
metric = { path = "../metric" }
metric_exporters = { path = "../metric_exporters" }
observability_deps = { path = "../observability_deps" }
# NOTE: we may not notice that pprof needs the "backtrace-rs" feature if we also build with the heappy feature, which depends on
# backtrace-rs. (Cargo unifies a dependency's features across the whole build graph, so pprof builds successfully just because
# another crate happens to enable backtrace-rs.)
pprof = { version = "0.11", default-features = false, features = ["flamegraph", "prost-codec"], optional = true }
pprof = { version = "0.12", default-features = false, features = ["flamegraph", "prost-codec"], optional = true }
service_grpc_testing = { path = "../service_grpc_testing" }
trace = { path = "../trace" }
trace_exporters = { path = "../trace_exporters" }
@ -38,7 +38,7 @@ log = "0.4"
parking_lot = "0.12"
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.99"
serde_json = "1.0.100"
serde_urlencoded = "0.7.0"
snafu = "0.7"
tokio = { version = "1.29", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }

View File

@ -18,7 +18,7 @@ iox_query = { version = "0.1.0", path = "../iox_query" }
ioxd_common = { path = "../ioxd_common" }
metric = { path = "../metric" }
parquet_file = { version = "0.1.0", path = "../parquet_file" }
thiserror = "1.0.40"
thiserror = "1.0.41"
tokio = { version = "1.29", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
tokio-util = { version = "0.7.8" }
trace = { path = "../trace" }

View File

@ -30,7 +30,7 @@ trace = { path = "../trace" }
arrow-flight = { workspace = true }
async-trait = "0.1"
hyper = "0.14"
thiserror = "1.0.40"
thiserror = "1.0.41"
tokio = { version = "1.29", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -18,7 +18,7 @@ metric = { path = "../metric" }
mutable_batch = { path = "../mutable_batch" }
object_store = { workspace = true }
router = { path = "../router" }
thiserror = "1.0.40"
thiserror = "1.0.41"
tokio-util = { version = "0.7.8" }
trace = { path = "../trace" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -18,12 +18,12 @@ hashbrown = { workspace = true }
itertools = "0.11"
workspace-hack = { version = "0.1", path = "../workspace-hack" }
percent-encoding = "2.2.0"
thiserror = "1.0.40"
thiserror = "1.0.41"
unicode-segmentation = "1.10.1"
[dev-dependencies]
assert_matches = "1.5.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.12"
paste = "1.0.13"
proptest = { version = "1.2.0", default-features = false }
rand = "0.8"

View File

@ -6,13 +6,13 @@ edition.workspace = true
license.workspace = true
[dependencies] # In alphabetical order
async-trait = "0.1.68"
async-trait = "0.1.70"
bytes = "1.4"
futures = "0.3"
iox_time = { version = "0.1.0", path = "../iox_time" }
metric = { version = "0.1.0", path = "../metric" }
object_store = { workspace = true }
pin-project = "1.1.1"
pin-project = "1.1.2"
tokio = { version = "1.29", features = ["io-util"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -22,7 +22,7 @@ pbjson-types = "0.5"
prost = "0.11"
schema = { path = "../schema" }
snafu = "0.7"
thiserror = "1.0.40"
thiserror = "1.0.41"
thrift = "0.17"
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt", "rt-multi-thread", "sync"] }
uuid = { version = "1", features = ["v4"] }

View File

@ -8,7 +8,7 @@ license.workspace = true
[dependencies]
arrow = { workspace = true }
arrow-flight = { workspace = true }
async-trait = "0.1.68"
async-trait = "0.1.70"
backoff = { path = "../backoff" }
bytes = "1.4"
cache_system = { path = "../cache_system" }

View File

@ -361,8 +361,8 @@ mod tests {
partition.create_parquet_file(builder).await;
let table_id = table.table.id;
let single_file_size = 208;
let two_file_size = 384;
let single_file_size = 240;
let two_file_size = 448;
assert!(single_file_size < two_file_size);
let cache = make_cache(&catalog);

View File

@ -17,6 +17,7 @@ use data_types::{
};
use datafusion::scalar::ScalarValue;
use iox_catalog::interface::Catalog;
use iox_query::chunk_statistics::{ColumnRange, ColumnRanges};
use iox_time::TimeProvider;
use observability_deps::tracing::debug;
use schema::sort::SortKey;
@ -27,8 +28,6 @@ use std::{
};
use trace::span::Span;
use crate::df_stats::{ColumnRange, ColumnRanges};
use super::{namespace::CachedTable, ram::RamSize};
const CACHE_ID: &str = "partition";

View File

@ -6,24 +6,21 @@ use self::{
invalidate_on_error::InvalidateOnErrorFlightClient,
test_util::MockIngesterConnection,
};
use crate::{
cache::{namespace::CachedTable, CatalogCache},
df_stats::{create_chunk_statistics, ColumnRanges},
};
use crate::cache::{namespace::CachedTable, CatalogCache};
use arrow::{datatypes::DataType, error::ArrowError, record_batch::RecordBatch};
use arrow_flight::decode::DecodedPayload;
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig, BackoffError};
use client_util::connection;
use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionHashId, PartitionId};
use datafusion::{error::DataFusionError, physical_plan::Statistics};
use datafusion::physical_plan::Statistics;
use futures::{stream::FuturesUnordered, TryStreamExt};
use ingester_query_grpc::{
encode_proto_predicate_as_base64, influxdata::iox::ingester::v1::IngesterQueryResponseMetadata,
IngesterQueryRequest,
};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
chunk_statistics::{create_chunk_statistics, ColumnRanges},
util::compute_timenanosecond_min_max,
QueryChunk, QueryChunkData,
};
@ -941,16 +938,6 @@ impl QueryChunk for IngesterChunk {
true
}
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
// TODO maybe some special handling?
Ok(None)
}
fn data(&self) -> QueryChunkData {
QueryChunkData::RecordBatches(self.batches.clone())
}

View File

@ -18,7 +18,6 @@ use workspace_hack as _;
mod cache;
mod database;
mod df_stats;
mod ingester;
mod namespace;
mod parquet;

View File

@ -2,6 +2,7 @@
use data_types::{ChunkId, ChunkOrder, PartitionId};
use datafusion::physical_plan::Statistics;
use iox_query::chunk_statistics::{create_chunk_statistics, ColumnRanges};
use parquet_file::chunk::ParquetChunk;
use schema::sort::SortKey;
use std::sync::Arc;
@ -11,8 +12,6 @@ mod query_access;
pub use creation::ChunkAdapter;
use crate::df_stats::{create_chunk_statistics, ColumnRanges};
/// Immutable metadata attached to a [`QuerierParquetChunk`].
#[derive(Debug)]
pub struct QuerierParquetChunkMeta {

View File

@ -1,11 +1,7 @@
use crate::parquet::QuerierParquetChunk;
use data_types::{ChunkId, ChunkOrder, PartitionId};
use datafusion::{error::DataFusionError, physical_plan::Statistics};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkData,
};
use predicate::Predicate;
use datafusion::physical_plan::Statistics;
use iox_query::{QueryChunk, QueryChunkData};
use schema::{sort::SortKey, Schema};
use std::{any::Any, sync::Arc};
@ -34,21 +30,6 @@ impl QueryChunk for QuerierParquetChunk {
false
}
fn column_values(
&self,
mut ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
ctx.set_metadata("column_name", column_name.to_string());
ctx.set_metadata("predicate", format!("{}", &predicate));
ctx.set_metadata("storage", "parquet");
// Since DataFusion can read Parquet, there is no advantage to
// manually implementing this vs just letting DataFusion do its thing
Ok(None)
}
fn data(&self) -> QueryChunkData {
QueryChunkData::Parquet(self.parquet_chunk.parquet_exec_input())
}

View File

@ -492,7 +492,6 @@ mod tests {
use super::*;
use crate::{
cache::test_util::{assert_cache_access_metric_count, assert_catalog_access_metric_count},
df_stats::ColumnRange,
ingester::{test_util::MockIngesterConnection, IngesterPartition},
table::test_util::{querier_table, IngesterPartitionBuilder},
};
@ -506,7 +505,7 @@ mod tests {
use generated_types::influxdata::iox::partition_template::v1::{
template_part::Part, PartitionTemplate, TemplatePart,
};
use iox_query::exec::IOxSessionContext;
use iox_query::{chunk_statistics::ColumnRange, exec::IOxSessionContext};
use iox_tests::{TestCatalog, TestParquetFileBuilder, TestTable};
use predicate::Predicate;
use schema::{builder::SchemaBuilder, InfluxFieldType, TIME_COLUMN_NAME};

View File

@ -1,11 +1,12 @@
use super::{PruneMetrics, QuerierTable, QuerierTableArgs};
use crate::{
cache::CatalogCache, create_ingester_connection_for_testing, df_stats::ColumnRanges,
parquet::ChunkAdapter, IngesterPartition,
cache::CatalogCache, create_ingester_connection_for_testing, parquet::ChunkAdapter,
IngesterPartition,
};
use arrow::record_batch::RecordBatch;
use data_types::ChunkId;
use iox_catalog::interface::{get_schema_by_name, SoftDeletedRows};
use iox_query::chunk_statistics::ColumnRanges;
use iox_tests::{TestCatalog, TestPartition, TestTable};
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use schema::{Projection, Schema};

View File

@ -49,7 +49,7 @@ criterion = { version = "0.5", default-features = false, features = ["async_toki
influxdb-line-protocol = { path = "../influxdb_line_protocol" }
iox_tests = { path = "../iox_tests" }
once_cell = "1"
paste = "1.0.12"
paste = "1.0.13"
pretty_assertions = "1.3.0"
proptest = { version = "1.2.0", default-features = false }
rand = "0.8.3"

View File

@ -32,6 +32,14 @@ impl SchemaBuilder {
Self::default()
}
pub fn with_capacity(n: usize) -> Self {
Self {
measurement: Default::default(),
fields: Vec::with_capacity(n),
finished: Default::default(),
}
}
/// Add a new tag column to this schema. By default tags are
/// potentially nullable as they are not guaranteed to be present
/// for all rows
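
A hedged usage sketch for the new constructor; the builder methods are the ones exercised elsewhere in this diff (`tag`, `influx_field`, `timestamp`, `build`). `with_capacity` merely pre-sizes the internal `fields` vector for a known column count:

```rust
use schema::{builder::SchemaBuilder, InfluxFieldType};

fn main() {
    // Three columns known up front: host (tag), usage (field), time.
    let _schema = SchemaBuilder::with_capacity(3)
        .tag("host")
        .influx_field("usage", InfluxFieldType::Float)
        .timestamp()
        .build()
        .expect("valid schema");
}
```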

View File

@ -1,5 +1,3 @@
use std::sync::Arc;
use arrow::{datatypes::Field, record_batch::RecordBatch};
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
@ -44,7 +42,7 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
/// This is infallible because the schemas of chunks within a
/// partition are assumed to be compatible, since that schema was
/// enforced as part of writing into the partition
pub fn merge_record_batch_schemas(batches: &[Arc<RecordBatch>]) -> Schema {
pub fn merge_record_batch_schemas(batches: &[RecordBatch]) -> Schema {
let mut merger = SchemaMerger::new();
for batch in batches {
let schema = Schema::try_from(batch.schema()).expect("Schema conversion error");
@ -172,6 +170,8 @@ impl<'a> SchemaMerger<'a> {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use crate::builder::SchemaBuilder;
use crate::InfluxFieldType::Integer;
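
The signature now takes `&[RecordBatch]` instead of `&[Arc<RecordBatch>]`, which costs callers nothing meaningful: `RecordBatch::clone` only bumps reference counts on the underlying column buffers. A self-contained illustration using the arrow crate already in the workspace:

```rust
use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1, 2]))]).unwrap();
    // Cloning shares the column buffers; no row data is copied, so building a
    // `&[RecordBatch]` slice is as cheap as building `&[Arc<RecordBatch>]`.
    let batches = vec![batch.clone(), batch];
    assert_eq!(batches.len(), 2);
}
```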

View File

@ -6,7 +6,7 @@ edition.workspace = true
license.workspace = true
[dependencies] # In alphabetical order
async-trait = "0.1.68"
async-trait = "0.1.70"
bytes = "1.4"
datafusion = { workspace = true }
iox_query = { path = "../iox_query" }

View File

@ -26,7 +26,7 @@ bytes = "1.4"
futures = "0.3"
prost = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.99"
serde_json = "1.0.100"
snafu = "0.7"
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

Some files were not shown because too many files have changed in this diff.