feat: correctly sort data and store in catalog metadata (#3864)

* feat: respect sort order in ChunkTableProvider (#3214)

feat: persist sort order in catalog (#3845)

refactor: owned SortKey (#3845)

* fix: size tests

* refactor: immutable SortKey

* test: test sort order restart (#3845)

* chore: explicit None for sort key

* chore: test cleanup

* fix: handling of sort keys containing fields

* chore: remove unused selected_sort_key

* chore: more docs

Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
pull/24376/head
Raphael Taylor-Davies 2022-02-25 17:56:27 +00:00 committed by GitHub
parent 8edc462c37
commit 2a842fbb1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 619 additions and 1338 deletions

View File

@ -14,7 +14,7 @@ use observability_deps::tracing::debug;
use parking_lot::Mutex;
use parquet_file::chunk::ParquetChunk;
use read_buffer::RBChunk;
use schema::{selection::Selection, Schema, TIME_COLUMN_NAME};
use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME};
use snafu::Snafu;
use std::sync::Arc;
use time::{Time, TimeProvider};
@ -88,6 +88,9 @@ pub struct ChunkMetadata {
/// into IOx. Note due to the compaction, etc... this may not be the chunk
/// that data was originally written into
pub time_of_last_write: Time,
/// Sort key for this chunk
pub sort_key: Option<SortKey>,
}
/// Different memory representations of a frozen chunk.
@ -458,11 +461,11 @@ impl CatalogChunk {
self.order
}
pub fn schema(&self) -> Arc<Schema> {
pub fn sort_key(&self) -> Option<&SortKey> {
match &self.stage {
ChunkStage::Open { mb_chunk, .. } => Arc::new(mb_chunk.schema(Selection::All).unwrap()),
ChunkStage::Open { .. } => None,
ChunkStage::Frozen { meta, .. } | ChunkStage::Persisted { meta, .. } => {
Arc::clone(&meta.schema)
meta.sort_key.as_ref()
}
}
}
@ -532,6 +535,7 @@ impl CatalogChunk {
delete_predicates: del_preds,
time_of_first_write: meta.time_of_first_write,
time_of_last_write: meta.time_of_last_write,
sort_key: meta.sort_key.clone(),
});
}
}
@ -762,6 +766,7 @@ impl CatalogChunk {
delete_predicates,
time_of_first_write: *time_of_first_write,
time_of_last_write: *time_of_last_write,
sort_key: None,
};
self.stage = ChunkStage::Frozen {
@ -1293,6 +1298,7 @@ mod tests {
delete_predicates: vec![],
time_of_first_write: now,
time_of_last_write: now,
sort_key: metadata.sort_key,
};
CatalogChunk::new_object_store_only(

View File

@ -121,6 +121,7 @@ impl DbChunk {
delete_predicates: vec![], // open chunk does not have delete predicate
time_of_first_write: *time_of_first_write,
time_of_last_write: *time_of_last_write,
sort_key: None,
};
(state, Arc::new(meta))
}
@ -535,20 +536,6 @@ impl QueryChunk for DbChunk {
}
}
/// Returns true if the chunk is sorted on its pk
/// Since data is compacted prior being moved to RUBs, data in RUBs and OBs
/// should be sorted on their PK as the results of compacting.
/// However, since we current sorted data based on their cardinality (see compute_sort_key),
/// 2 different chunks may be sorted on different order of key columns.
fn is_sorted_on_pk(&self) -> bool {
self.schema().is_sorted_on_pk()
}
/// Returns the sort key of the chunk if any
fn sort_key(&self) -> Option<SortKey<'_>> {
self.meta.schema.sort_key()
}
fn chunk_type(&self) -> &str {
match &self.state {
State::MutableBuffer { .. } => "MUB",
@ -571,6 +558,10 @@ impl QueryChunkMeta for DbChunk {
Arc::clone(&self.meta.schema)
}
fn sort_key(&self) -> Option<&SortKey> {
self.meta.sort_key.as_ref()
}
// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
let pred = &self.meta.delete_predicates;

View File

@ -1697,7 +1697,7 @@ mod tests {
.id();
// A chunk is now in the object store and still in read buffer
let expected_parquet_size = 1247;
let expected_parquet_size = 1257;
catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", expected_read_buffer_size);
// now also in OS
catalog_chunk_size_bytes_metric_eq(registry, "object_store", expected_parquet_size);
@ -2128,7 +2128,7 @@ mod tests {
// Read buffer + Parquet chunk size
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 1700);
catalog_chunk_size_bytes_metric_eq(registry, "object_store", 1248);
catalog_chunk_size_bytes_metric_eq(registry, "object_store", 1259);
// All the chunks should have different IDs
assert_ne!(mb_chunk.id(), rb_chunk.id());
@ -2245,7 +2245,7 @@ mod tests {
let registry = test_db.metric_registry.as_ref();
// Read buffer + Parquet chunk size
let object_store_bytes = 1248;
let object_store_bytes = 1259;
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 0);
catalog_chunk_size_bytes_metric_eq(registry, "read_buffer", 1700);
catalog_chunk_size_bytes_metric_eq(registry, "object_store", object_store_bytes);
@ -2725,8 +2725,8 @@ mod tests {
id: chunk_summaries[0].id,
storage: ChunkStorage::ReadBufferAndObjectStore,
lifecycle_action,
memory_bytes: 4088, // size of RB and OS chunks
object_store_bytes: 1557, // size of parquet file
memory_bytes: 4102, // size of RB and OS chunks
object_store_bytes: 1573, // size of parquet file
row_count: 2,
time_of_last_access: None,
time_of_first_write: Time::from_timestamp_nanos(1),
@ -2776,7 +2776,7 @@ mod tests {
assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 1463);
assert_eq!(db.catalog.metrics().memory().read_buffer(), 2550);
assert_eq!(db.catalog.metrics().memory().object_store(), 1538);
assert_eq!(db.catalog.metrics().memory().object_store(), 1552);
}
#[tokio::test]

View File

@ -91,8 +91,8 @@ pub(crate) fn compact_chunks(
let summaries = query_chunks
.iter()
.map(|x| x.summary().expect("Chunk should have summary"));
let key = compute_sort_key(summaries);
let key_str = format!("\"{}\"", key); // for logging
let sort_key = compute_sort_key(summaries);
// build schema
//
@ -102,9 +102,11 @@ pub(crate) fn compact_chunks(
// partitions).
let schema = merge_schemas(&query_chunks);
// Cannot move query_chunks as the sort key borrows the column names
let (schema, plan) =
ReorgPlanner::new().compact_plan(schema, query_chunks.iter().map(Arc::clone), key)?;
let plan = ReorgPlanner::new().compact_plan(
Arc::clone(&schema),
query_chunks,
sort_key.clone(),
)?;
let physical_plan = ctx.prepare_plan(&plan).await?;
let stream = ctx.execute_stream(physical_plan).await?;
@ -148,6 +150,7 @@ pub(crate) fn compact_chunks(
delete_predicates,
time_of_first_write,
time_of_last_write,
sort_key: Some(sort_key.clone()),
};
let (_, chunk) = partition.create_rub_chunk(None, max_order, metadata, rb_chunk);
@ -158,7 +161,7 @@ pub(crate) fn compact_chunks(
info!(input_chunks=chunk_ids.len(), %rub_row_groups,
%input_rows, %output_rows,
sort_key=%key_str, compaction_took = ?elapsed, fut_execution_duration= ?fut_now.elapsed(),
%sort_key, compaction_took = ?elapsed, fut_execution_duration= ?fut_now.elapsed(),
rows_per_sec=?throughput, "chunk(s) compacted");
let snapshot = DbChunk::snapshot(&chunk.read());

View File

@ -33,6 +33,7 @@ use parquet_file::{
};
use persistence_windows::checkpoint::{DatabaseCheckpoint, PartitionCheckpoint};
use query::{compute_sort_key, exec::ExecutorType, frontend::reorg::ReorgPlanner, QueryChunkMeta};
use schema::sort::SortKey;
use schema::Schema;
use snafu::{OptionExt, ResultExt};
use std::{
@ -85,6 +86,7 @@ pub(crate) fn compact_object_store_chunks(
// The partition will be unlocked after the chunks are marked and snapshotted
let compacting_os_chunks =
mark_chunks_to_compact(partition, chunks, &registration, compacted_chunk_id)?;
let delete_predicates_before = compacting_os_chunks.delete_predicates;
let fut = async move {
@ -116,7 +118,9 @@ pub(crate) fn compact_object_store_chunks(
time_of_first_write: compacting_os_chunks.time_of_first_write,
time_of_last_write: compacting_os_chunks.time_of_last_write,
chunk_order: compacting_os_chunks.max_order,
sort_key: Some(sort_key.clone()),
};
let compacted_and_persisted_chunk = persist_stream_to_chunk(
&db,
&partition_addr,
@ -145,6 +149,7 @@ pub(crate) fn compact_object_store_chunks(
compacted_and_persisted_chunk.clone(),
compacting_os_chunks.partition,
delete_predicates_before,
sort_key.clone(),
)
.await;
@ -339,16 +344,15 @@ async fn compact_chunks(db: &Db, query_chunks: &[Arc<DbChunk>]) -> Result<Compac
.iter()
.map(|x| x.summary().expect("Chunk should have summary"));
let sort_key = compute_sort_key(summaries);
let sort_key_str = format!("\"{}\"", sort_key); // for logging
// Merge schema of the compacting chunks
let merged_schema = merge_schemas(query_chunks);
// Build compact query plan
let (plan_schema, plan) = ReorgPlanner::new().compact_plan(
let plan = ReorgPlanner::new().compact_plan(
Arc::clone(&merged_schema),
query_chunks.iter().map(Arc::clone),
sort_key,
sort_key.clone(),
)?;
let physical_plan = ctx.prepare_plan(&plan).await?;
@ -357,8 +361,8 @@ async fn compact_chunks(db: &Db, query_chunks: &[Arc<DbChunk>]) -> Result<Compac
Ok(CompactedStream {
stream,
schema: plan_schema,
sort_key: sort_key_str,
schema: merged_schema,
sort_key,
})
}
@ -366,7 +370,7 @@ async fn compact_chunks(db: &Db, query_chunks: &[Arc<DbChunk>]) -> Result<Compac
struct CompactedStream {
stream: SendableRecordBatchStream,
schema: Arc<Schema>,
sort_key: String,
sort_key: SortKey,
}
/// Persist a provided stream to a new OS chunk
@ -443,6 +447,7 @@ async fn update_in_memory_catalog(
parquet_chunk: Option<Arc<ParquetChunk>>,
partition: Arc<RwLock<Partition>>,
delete_predicates_before: HashSet<Arc<DeletePredicate>>,
sort_key: SortKey,
) -> Option<Arc<DbChunk>> {
// Acquire write lock to drop the old chunks while also getting delete predicates added during compaction
let mut partition = partition.write();
@ -474,6 +479,7 @@ async fn update_in_memory_catalog(
table_summary: Arc::clone(parquet_chunk.table_summary()),
schema: parquet_chunk.schema(),
delete_predicates,
sort_key: Some(sort_key),
time_of_first_write: iox_metadata.time_of_first_write,
time_of_last_write: iox_metadata.time_of_last_write,
};

View File

@ -95,6 +95,8 @@ pub fn persist_chunks(
return Ok(None);
}
let query_chunk_len = query_chunks.len();
let time_of_first_write =
time_of_first_write.expect("Should have had a first write somewhere");
@ -104,17 +106,16 @@ pub fn persist_chunks(
let summaries = query_chunks
.iter()
.map(|x| x.summary().expect("Chunk should have summary"));
let key = compute_sort_key(summaries);
let key_str = format!("\"{}\"", key); // for logging
let sort_key = compute_sort_key(summaries);
// build schema
let schema = merge_schemas(&query_chunks);
// Cannot move query_chunks as the sort key borrows the column names
let (schema, plan) = ReorgPlanner::new().split_plan(
schema,
query_chunks.iter().map(Arc::clone),
key,
let plan = ReorgPlanner::new().split_plan(
Arc::clone(&schema),
query_chunks,
sort_key.clone(),
flush_timestamp,
)?;
@ -170,6 +171,7 @@ pub fn persist_chunks(
delete_predicates: delete_predicates.clone(),
time_of_first_write,
time_of_last_write,
sort_key: Some(sort_key.clone()),
};
partition_write.create_rub_chunk(None, max_order, metadata, remainder);
@ -193,6 +195,7 @@ pub fn persist_chunks(
delete_predicates,
time_of_first_write,
time_of_last_write,
sort_key: Some(sort_key.clone()),
};
let (new_chunk_id, new_chunk) = partition_write.create_rub_chunk(
@ -220,9 +223,9 @@ pub fn persist_chunks(
// input rows per second
let throughput = (input_rows as u128 * 1_000_000_000) / elapsed.as_nanos();
info!(input_chunks=query_chunks.len(),
info!(input_chunks=query_chunk_len,
input_rows, persisted_rows, remainder_rows,
sort_key=%key_str, compaction_took = ?elapsed,
sort_key=%sort_key, compaction_took = ?elapsed,
?max_persistable_timestamp,
rows_per_sec=?throughput, "chunk(s) persisted");

View File

@ -60,6 +60,7 @@ pub(super) fn write_chunk_to_object_store(
let table_name = Arc::clone(&addr.table_name);
let partition_key = Arc::clone(&addr.partition_key);
let chunk_order = chunk.order();
let sort_key = chunk.sort_key().cloned();
let delete_predicates = chunk.delete_predicates().to_vec();
let (tracker, registration) = db.jobs.register(Job::WriteChunk {
@ -134,6 +135,7 @@ pub(super) fn write_chunk_to_object_store(
time_of_first_write,
time_of_last_write,
chunk_order,
sort_key,
};
let written_result = timeout(

View File

@ -255,6 +255,7 @@ impl CatalogState for Loader {
delete_predicates,
time_of_first_write: iox_md.time_of_first_write,
time_of_last_write: iox_md.time_of_last_write,
sort_key: iox_md.sort_key,
};
partition.insert_object_store_only_chunk(

View File

@ -12,6 +12,7 @@ use data_types::{
error::ErrorLogger,
partition_metadata::{ColumnSummary, PartitionSummary, TableSummary},
};
use schema::sort::SortKey;
use crate::{
catalog::Catalog,
@ -130,7 +131,7 @@ impl IoxSystemTable for ChunkColumnsTable {
(
chunk.table_summary(),
chunk.detailed_summary(),
chunk.schema(),
chunk.sort_key().cloned(),
)
});
@ -159,7 +160,7 @@ fn chunk_columns_schema() -> SchemaRef {
fn assemble_chunk_columns(
schema: SchemaRef,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary, Arc<schema::Schema>)>,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary, Option<SortKey>)>,
) -> Result<RecordBatch> {
// Create an iterator over each column in each table in each chunk
// so we can build `chunk_columns` column by column
@ -171,16 +172,14 @@ fn assemble_chunk_columns(
let rows = chunk_summaries
.iter()
.map(|(table_summary, chunk_summary, schema)| {
let sort_key = schema.sort_key().unwrap_or_default();
.map(|(table_summary, chunk_summary, sort_key)| {
table_summary
.columns
.iter()
.map(move |column_summary| EachColumn {
chunk_summary,
column_summary,
column_sort: sort_key.get(&column_summary.name),
column_sort: sort_key.as_ref().and_then(|x| x.get(&column_summary.name)),
})
})
.flatten()
@ -300,7 +299,6 @@ mod tests {
chunk_metadata::{ChunkColumnSummary, ChunkId, ChunkOrder, ChunkStorage, ChunkSummary},
partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics},
};
use schema::builder::SchemaBuilder;
use schema::sort::SortKey;
use time::Time;
@ -367,25 +365,7 @@ mod tests {
fn test_assemble_chunk_columns() {
let lifecycle_action = None;
let mut sort1 = SortKey::default();
sort1.push("c2", Default::default());
sort1.push("c1", Default::default());
let schema1 = SchemaBuilder::new()
.field("c1", DataType::Utf8)
.field("c2", DataType::Float64)
.build_with_sort_key(&sort1)
.unwrap();
let schema2 = SchemaBuilder::new()
.field("c1", DataType::Float64)
.build()
.unwrap();
let schema3 = SchemaBuilder::new()
.field("c3", DataType::Float64)
.build()
.unwrap();
let sort = SortKey::from_columns(vec!["c2", "c1"]);
let summaries = vec![
(
@ -435,7 +415,7 @@ mod tests {
},
],
},
Arc::new(schema1),
Some(sort),
),
(
Arc::new(TableSummary {
@ -466,7 +446,7 @@ mod tests {
memory_bytes: 100,
}],
},
Arc::new(schema2),
None,
),
(
Arc::new(TableSummary {
@ -497,7 +477,7 @@ mod tests {
memory_bytes: 200,
}],
},
Arc::new(schema3),
None,
),
];

View File

@ -39,6 +39,9 @@ message IoxMetadata {
// Order of this chunk relative to other overlapping chunks.
uint32 chunk_order = 10;
// The sort key of this chunk
SortKey sort_key = 12;
}
// Partition checkpoint.
@ -81,3 +84,21 @@ message OptionalMinMaxSequence {
OptionalUint64 min = 1;
uint64 max = 2;
}
message SortKey {
// A sort expression
message Expr {
// The name of the column
string column = 1;
// Whether the data is sorted in descending order
bool descending = 2;
// Whether the data is sorted with nulls first
bool nulls_first = 3;
}
repeated Expr expressions = 1;
}

View File

@ -259,7 +259,11 @@ async fn create_readbuffer_chunk(fixture: &ServerFixture, db_name: &str) -> Chun
let partition_key = "cpu";
let table_name = "cpu";
let lp_lines = vec!["cpu,region=west user=23.2 100"];
let lp_lines = vec![
"cpu,region=west,host=a user=23.2 100",
"cpu,region=west,host=b user=34.2 100",
"cpu,region=east,host=c user=54.2 100",
];
write_client
.write_lp(db_name, lp_lines.join("\n"), 0)
@ -310,7 +314,7 @@ async fn create_readbuffer_chunk(fixture: &ServerFixture, db_name: &str) -> Chun
async fn assert_chunk_query_works(fixture: &ServerFixture, db_name: &str) {
let mut client = fixture.flight_client();
let sql_query = "select region, user, time from cpu";
let sql_query = "select region, host, user, time from cpu";
let batches = client
.perform_query(db_name, sql_query)
@ -321,12 +325,35 @@ async fn assert_chunk_query_works(fixture: &ServerFixture, db_name: &str) {
.unwrap();
let expected_read_data = vec![
"+--------+------+--------------------------------+",
"| region | user | time |",
"+--------+------+--------------------------------+",
"| west | 23.2 | 1970-01-01T00:00:00.000000100Z |",
"+--------+------+--------------------------------+",
"+--------+------+------+--------------------------------+",
"| region | host | user | time |",
"+--------+------+------+--------------------------------+",
"| east | c | 54.2 | 1970-01-01T00:00:00.000000100Z |",
"| west | a | 23.2 | 1970-01-01T00:00:00.000000100Z |",
"| west | b | 34.2 | 1970-01-01T00:00:00.000000100Z |",
"+--------+------+------+--------------------------------+",
];
assert_batches_eq!(expected_read_data, &batches);
let batches = client
.perform_query(db_name, "select column_name, row_count, null_count, min_value, max_value, sort_ordinal from system.chunk_columns")
.await
.unwrap()
.collect()
.await
.unwrap();
let expected_columns = vec![
"+-------------+-----------+------------+-----------+-----------+--------------+",
"| column_name | row_count | null_count | min_value | max_value | sort_ordinal |",
"+-------------+-----------+------------+-----------+-----------+--------------+",
"| host | 3 | 0 | a | c | 1 |",
"| region | 3 | 0 | east | west | 0 |",
"| time | 3 | 0 | 100 | 100 | 2 |",
"| user | 3 | 0 | 23.2 | 54.2 | |",
"+-------------+-----------+------------+-----------+-----------+--------------+",
];
assert_batches_eq!(expected_columns, &batches);
}

View File

@ -27,7 +27,7 @@ async fn test_operations() {
write_client
.write_lp(&db_name1, lp_lines.join("\n"), 0)
.await
.expect("write succeded");
.expect("write succeeded");
let chunks = list_chunks(&fixture, &db_name1).await;
let chunk_id = chunks[0].id;

View File

@ -129,6 +129,10 @@ impl QueryChunkMeta for QueryableBatch {
merge_record_batch_schemas(&batches)
}
fn sort_key(&self) -> Option<&SortKey> {
None
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
self.delete_predicates.as_ref()
}
@ -270,16 +274,6 @@ impl QueryChunk for QueryableBatch {
Ok(Box::pin(stream))
}
/// Returns true if data of this chunk is sorted
fn is_sorted_on_pk(&self) -> bool {
false
}
/// Returns the sort key of the chunk if any
fn sort_key(&self) -> Option<SortKey<'_>> {
None
}
/// Returns chunk type
fn chunk_type(&self) -> &str {
"PersistingBatch"

View File

@ -490,6 +490,7 @@ File {
chunk_order: ChunkOrder(
1,
),
sort_key: None,
},
),
schema: Ok(

View File

@ -380,6 +380,7 @@ mod tests {
time_of_first_write: Time::from_timestamp_nanos(0),
time_of_last_write: Time::from_timestamp_nanos(0),
chunk_order: ChunkOrder::new(5).unwrap(),
sort_key: None,
};
let stream: SendableRecordBatchStream = Box::pin(MemoryStream::new(record_batches));
let (path, file_size_bytes, metadata) = storage

View File

@ -113,6 +113,7 @@ use persistence_windows::{
min_max_sequence::OptionalMinMaxSequence,
};
use prost::Message;
use schema::sort::{SortKey, SortKeyBuilder};
use schema::{InfluxColumnType, InfluxFieldType, Schema};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use std::{collections::BTreeMap, convert::TryInto, sync::Arc};
@ -293,6 +294,9 @@ pub struct IoxMetadataOld {
/// Order of this chunk relative to other overlapping chunks.
pub chunk_order: ChunkOrder,
/// Sort key of this chunk
pub sort_key: Option<SortKey>,
}
impl IoxMetadataOld {
@ -332,6 +336,7 @@ impl IoxMetadataOld {
.context(IoxMetadataFieldMissingSnafu {
field: "partition_checkpoint",
})?;
let sequencer_numbers = proto_partition_checkpoint
.sequencer_numbers
.into_iter()
@ -345,6 +350,7 @@ impl IoxMetadataOld {
}
})
.collect::<Result<BTreeMap<u32, OptionalMinMaxSequence>>>()?;
let flush_timestamp = decode_timestamp_from_field(
proto_partition_checkpoint.flush_timestamp,
"partition_checkpoint.flush_timestamp",
@ -363,6 +369,7 @@ impl IoxMetadataOld {
.context(IoxMetadataFieldMissingSnafu {
field: "database_checkpoint",
})?;
let sequencer_numbers = proto_database_checkpoint
.sequencer_numbers
.into_iter()
@ -376,8 +383,17 @@ impl IoxMetadataOld {
}
})
.collect::<Result<BTreeMap<u32, OptionalMinMaxSequence>>>()?;
let database_checkpoint = DatabaseCheckpoint::new(sequencer_numbers);
let sort_key = proto_msg.sort_key.map(|proto_key| {
let mut builder = SortKeyBuilder::with_capacity(proto_key.expressions.len());
for expr in proto_key.expressions {
builder = builder.with_col_opts(expr.column, expr.descending, expr.nulls_first)
}
builder.build()
});
Ok(Self {
creation_timestamp,
time_of_first_write,
@ -395,6 +411,7 @@ impl IoxMetadataOld {
field: "chunk_order".to_string(),
}
})?,
sort_key,
})
}
@ -442,6 +459,20 @@ impl IoxMetadataOld {
.collect(),
};
let sort_key = self
.sort_key
.as_ref()
.map(|key| preserved_catalog::SortKey {
expressions: key
.iter()
.map(|(name, options)| preserved_catalog::sort_key::Expr {
column: name.to_string(),
descending: options.descending,
nulls_first: options.nulls_first,
})
.collect(),
});
let proto_msg = preserved_catalog::IoxMetadata {
version: METADATA_VERSION,
creation_timestamp: Some(self.creation_timestamp.date_time().into()),
@ -453,6 +484,7 @@ impl IoxMetadataOld {
partition_checkpoint: Some(proto_partition_checkpoint),
database_checkpoint: Some(proto_database_checkpoint),
chunk_order: self.chunk_order.get(),
sort_key,
};
let mut buf = Vec::new();
@ -1203,6 +1235,7 @@ mod tests {
time_of_first_write: Time::from_timestamp(3234, 0),
time_of_last_write: Time::from_timestamp(3234, 3456),
chunk_order: ChunkOrder::new(5).unwrap(),
sort_key: None,
};
let proto_bytes = metadata.to_protobuf().unwrap();

View File

@ -420,6 +420,7 @@ mod tests {
time_of_first_write: Time::from_timestamp_nanos(456),
time_of_last_write: Time::from_timestamp_nanos(43069346),
chunk_order: ChunkOrder::new(5).unwrap(),
sort_key: None,
};
// create parquet file
@ -495,6 +496,7 @@ mod tests {
time_of_first_write: Time::from_timestamp_nanos(234),
time_of_last_write: Time::from_timestamp_nanos(4784),
chunk_order: ChunkOrder::new(5).unwrap(),
sort_key: None,
};
let (path, _file_size_bytes, _metadata) = storage

View File

@ -92,6 +92,7 @@ impl ChunkGenerator {
database_checkpoint,
time_of_first_write: Time::from_timestamp(30, 40),
time_of_last_write: Time::from_timestamp(50, 60),
sort_key: None,
};
let (record_batches, schema, column_summaries, rows) = match self.config {

View File

@ -6,7 +6,6 @@ pub mod sql;
mod test {
use std::sync::Arc;
use arrow::compute::SortOptions;
use datafusion::physical_plan::{
metrics::{self, MetricValue},
ExecutionPlan, ExecutionPlanVisitor,
@ -60,18 +59,10 @@ mod test {
#[tokio::test]
async fn test_metrics() {
let (schema, chunks) = get_test_chunks();
let mut sort_key = SortKey::with_capacity(1);
sort_key.push(
"time",
SortOptions {
descending: false,
nulls_first: false,
},
);
let sort_key = SortKey::from_columns(vec!["time", "tag1"]);
// Use a split plan as it has StreamSplitExec, DeduplicateExec and IOxReadFilternode
let (_, split_plan) = ReorgPlanner::new()
let split_plan = ReorgPlanner::new()
.split_plan(schema, chunks, sort_key, 1000)
.expect("created compact plan");

View File

@ -3,7 +3,7 @@
use std::sync::Arc;
use datafusion::logical_plan::{col, lit_timestamp_nano, Expr, LogicalPlan, LogicalPlanBuilder};
use observability_deps::tracing::{debug, trace};
use observability_deps::tracing::debug;
use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME};
use crate::{
@ -112,8 +112,8 @@ impl ReorgPlanner {
&self,
schema: Arc<Schema>,
chunks: I,
output_sort: SortKey<'_>,
) -> Result<(Arc<Schema>, LogicalPlan)>
sort_key: SortKey,
) -> Result<LogicalPlan>
where
C: QueryChunk + 'static,
I: IntoIterator<Item = Arc<C>>,
@ -121,29 +121,13 @@ impl ReorgPlanner {
let ScanPlan {
plan_builder,
provider,
} = self.sorted_scan_plan(schema, chunks)?;
let mut schema = provider.iox_schema();
// Set the sort_key of the schema to the compacted chunk's sort key
// Try to do this only if the sort key changes so we avoid unnecessary schema copies.
trace!(input_schema=?schema, "Setting sort key on schema for compact plan");
if schema
.sort_key()
.map_or(true, |existing_key| existing_key != output_sort)
{
let mut schema_cloned = schema.as_ref().clone();
schema_cloned.set_sort_key(&output_sort);
schema = Arc::new(schema_cloned);
}
trace!(output_schema=?schema, "Setting sort key on schema for compact plan");
} = self.sorted_scan_plan(schema, chunks, sort_key)?;
let plan = plan_builder.build().context(BuildingPlanSnafu)?;
debug!(table_name=provider.table_name(), plan=%plan.display_indent_schema(),
"created compact plan for table");
Ok((schema, plan))
Ok(plan)
}
/// Creates an execution plan for the SPLIT operations which does the following:
@ -194,9 +178,9 @@ impl ReorgPlanner {
&self,
schema: Arc<Schema>,
chunks: I,
output_sort: SortKey<'_>,
sort_key: SortKey,
split_time: i64,
) -> Result<(Arc<Schema>, LogicalPlan)>
) -> Result<LogicalPlan>
where
C: QueryChunk + 'static,
I: IntoIterator<Item = Arc<C>>,
@ -204,46 +188,33 @@ impl ReorgPlanner {
let ScanPlan {
plan_builder,
provider,
} = self.sorted_scan_plan(schema, chunks)?;
let mut schema = provider.iox_schema();
// Set output_sort as the sort_key of the schema
// Try to do this only if the sort key changes so we avoid unnecessary schema copies.
trace!(input_schema=?schema, "Setting sort key on schema for split plan");
if schema
.sort_key()
.map_or(true, |existing_key| existing_key != output_sort)
{
let mut schema_cloned = schema.as_ref().clone();
schema_cloned.set_sort_key(&output_sort);
schema = Arc::new(schema_cloned);
}
trace!(output_schema=?schema, "Setting sort key on schema for split plan");
} = self.sorted_scan_plan(schema, chunks, sort_key)?;
// time <= split_time
let split_expr = col(TIME_COLUMN_NAME).lt_eq(lit_timestamp_nano(split_time));
let plan = plan_builder.build().context(BuildingPlanSnafu)?;
let plan = make_stream_split(plan, split_expr);
debug!(table_name=provider.table_name(), plan=%plan.display_indent_schema(),
"created split plan for table");
Ok((schema, plan))
Ok(plan)
}
/// Creates a scan plan for the given set of chunks.
///
/// Output data of the scan will be deduplicated sorted if `sort=true` on
/// the optimal sort order of the chunks' PK columns (tags and time).
///
/// The optimal sort order is computed based on the PK columns cardinality
/// that will be best for RLE encoding.
///
/// Refer to query::provider::build_scan_plan for the detail of the plan
///
fn sorted_scan_plan<C, I>(&self, schema: Arc<Schema>, chunks: I) -> Result<ScanPlan<C>>
fn sorted_scan_plan<C, I>(
&self,
schema: Arc<Schema>,
chunks: I,
sort_key: SortKey,
) -> Result<ScanPlan<C>>
where
C: QueryChunk + 'static,
I: IntoIterator<Item = Arc<C>>,
@ -256,12 +227,11 @@ impl ReorgPlanner {
let table_name = &table_name;
// Prepare the plan for the table
let mut builder = ProviderBuilder::new(table_name, schema);
// Tell the scan of this provider to sort its output on the chunks' PK
builder.ensure_pk_sort();
// There are no predicates in these plans, so no need to prune them
builder = builder.add_no_op_pruner();
let mut builder = ProviderBuilder::new(table_name, schema)
// There are no predicates in these plans, so no need to prune them
.add_no_op_pruner()
// Tell the scan of this provider to sort its output on the chunks' PK
.with_sort_key(sort_key);
for chunk in chunks {
// check that it is consistent with this table_name
@ -278,6 +248,7 @@ impl ReorgPlanner {
let provider = builder
.build()
.context(CreatingProviderSnafu { table_name })?;
let provider = Arc::new(provider);
// Scan all columns
@ -301,10 +272,10 @@ struct ScanPlan<C: QueryChunk + 'static> {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use arrow_util::assert_batches_eq;
use datafusion_util::{test_collect, test_collect_partition};
use schema::merge::SchemaMerger;
use schema::sort::SortKeyBuilder;
use crate::{
exec::{Executor, ExecutorType},
@ -413,23 +384,12 @@ mod test {
let (schema, chunks) = get_test_chunks().await;
let mut sort_key = SortKey::with_capacity(2);
sort_key.push(
"tag1",
SortOptions {
descending: true,
nulls_first: true,
},
);
sort_key.push(
"time",
SortOptions {
descending: false,
nulls_first: false,
},
);
let sort_key = SortKeyBuilder::with_capacity(2)
.with_col_opts("tag1", true, true)
.with_col_opts(TIME_COLUMN_NAME, false, false)
.build();
let (_, compact_plan) = ReorgPlanner::new()
let compact_plan = ReorgPlanner::new()
.compact_plan(schema, chunks, sort_key)
.expect("created compact plan");
@ -453,14 +413,14 @@ mod test {
"+-----------+------------+------+--------------------------------+",
"| field_int | field_int2 | tag1 | time |",
"+-----------+------------+------+--------------------------------+",
"| 100 | | AL | 1970-01-01T00:00:00.000000050Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |",
"| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |",
"| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |",
"| 1000 | | MT | 1970-01-01T00:00:00.000001Z |",
"| 5 | | MT | 1970-01-01T00:00:00.000005Z |",
"| 10 | | MT | 1970-01-01T00:00:00.000007Z |",
"| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |",
"| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |",
"| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |",
"| 70 | | CT | 1970-01-01T00:00:00.000000100Z |",
"| 100 | | AL | 1970-01-01T00:00:00.000000050Z |",
"+-----------+------------+------+--------------------------------+",
];
@ -474,17 +434,13 @@ mod test {
// the operator is tested in its own module.
let (schema, chunks) = get_test_chunks().await;
let mut sort_key = SortKey::with_capacity(1);
sort_key.push(
"time",
SortOptions {
descending: false,
nulls_first: false,
},
);
let sort_key = SortKeyBuilder::with_capacity(2)
.with_col_opts("time", false, false)
.with_col_opts("tag1", false, true)
.build();
// split on 1000 should have timestamps 1000, 5000, and 7000
let (_, split_plan) = ReorgPlanner::new()
let split_plan = ReorgPlanner::new()
.split_plan(schema, chunks, sort_key, 1000)
.expect("created compact plan");
@ -519,16 +475,16 @@ mod test {
let batches1 = test_collect_partition(physical_plan, 1).await;
// Sorted on state (tag1) ASC and time
// Sorted on time
let expected = vec![
"+-----------+------------+------+-----------------------------+",
"| field_int | field_int2 | tag1 | time |",
"+-----------+------------+------+-----------------------------+",
"| 5 | | MT | 1970-01-01T00:00:00.000005Z |",
"| 10 | | MT | 1970-01-01T00:00:00.000007Z |",
"| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |",
"| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |",
"| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |",
"| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |",
"| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |",
"+-----------+------------+------+-----------------------------+",
];

View File

@ -34,6 +34,7 @@ pub mod statistics;
pub mod util;
pub use exec::context::{DEFAULT_CATALOG, DEFAULT_SCHEMA};
use schema::sort::SortKeyBuilder;
/// Trait for an object (designed to be a Chunk) which can provide
/// metadata
@ -44,6 +45,9 @@ pub trait QueryChunkMeta: Sized {
/// return a reference to the summary of the data held in this chunk
fn schema(&self) -> Arc<Schema>;
/// return a reference to the sort key if any
fn sort_key(&self) -> Option<&SortKey>;
/// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> &[Arc<DeletePredicate>];
@ -207,12 +211,6 @@ pub trait QueryChunk: QueryChunkMeta + Debug + Send + Sync {
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, Self::Error>;
/// Returns true if data of this chunk is sorted
fn is_sorted_on_pk(&self) -> bool;
/// Returns the sort key of the chunk if any
fn sort_key(&self) -> Option<SortKey<'_>>;
/// Returns chunk type which is either MUB, RUB, OS
fn chunk_type(&self) -> &str;
@ -233,6 +231,10 @@ where
self.as_ref().schema()
}
fn sort_key(&self) -> Option<&SortKey> {
self.as_ref().sort_key()
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
let pred = self.as_ref().delete_predicates();
debug!(?pred, "Delete predicate in QueryChunkMeta");
@ -251,19 +253,14 @@ where
chunks.iter().all(|c| c.summary().is_some())
}
pub fn compute_sort_key_for_chunks<'a, C>(schema: &'a Schema, chunks: &'a [C]) -> SortKey<'a>
pub fn compute_sort_key_for_chunks<C>(schema: &Schema, chunks: &[C]) -> SortKey
where
C: QueryChunkMeta,
{
if !chunks_have_stats(chunks) {
// chunks have not enough stats, return its pk that is
// sorted lexicographically but time column always last
let pk = schema.primary_key();
let mut sort_key = SortKey::with_capacity(pk.len());
for col in pk {
sort_key.push(col, Default::default())
}
sort_key
SortKey::from_columns(schema.primary_key())
} else {
let summaries = chunks
.iter()
@ -276,7 +273,7 @@ where
///
/// In the absence of more precise information, this should yield a
/// good ordering for RLE compression
pub fn compute_sort_key<'a>(summaries: impl Iterator<Item = &'a TableSummary>) -> SortKey<'a> {
pub fn compute_sort_key<'a>(summaries: impl Iterator<Item = &'a TableSummary>) -> SortKey {
let mut cardinalities: HashMap<&str, u64> = Default::default();
for summary in summaries {
for column in &summary.columns {
@ -298,11 +295,13 @@ pub fn compute_sort_key<'a>(summaries: impl Iterator<Item = &'a TableSummary>) -
// Sort by (cardinality, column_name) to have deterministic order if same cardinality
cardinalities.sort_by_key(|x| (x.1, x.0));
let mut key = SortKey::with_capacity(cardinalities.len() + 1);
let mut builder = SortKeyBuilder::with_capacity(cardinalities.len() + 1);
for (col, _) in cardinalities {
key.push(col, Default::default())
builder = builder.with_col(col)
}
key.push(TIME_COLUMN_NAME, Default::default());
builder = builder.with_col(TIME_COLUMN_NAME);
let key = builder.build();
trace!(computed_sort_key=?key, "Value of sort key from compute_sort_key");

File diff suppressed because it is too large Load Diff

View File

@ -235,6 +235,9 @@ pub struct TestChunk {
/// Order of this chunk relative to other overlapping chunks.
order: ChunkOrder,
/// The sort key of this chunk
sort_key: Option<SortKey>,
}
/// Implements a method for adding a column with default stats
@ -310,6 +313,7 @@ impl TestChunk {
predicate_match: Default::default(),
delete_predicates: Default::default(),
order: ChunkOrder::MIN,
sort_key: None,
}
}
@ -858,13 +862,11 @@ impl TestChunk {
}
/// Set the sort key for this chunk
pub fn with_sort_key(mut self, sort_key: &SortKey<'_>) -> Self {
let mut merger = SchemaMerger::new();
merger = merger
.merge(self.schema.as_ref())
.expect("merging was successful");
self.schema = Arc::new(merger.build_with_sort_key(sort_key));
self
pub fn with_sort_key(self, sort_key: SortKey) -> Self {
Self {
sort_key: Some(sort_key),
..self
}
}
/// Returns all columns of the table
@ -929,16 +931,6 @@ impl QueryChunk for TestChunk {
Ok(stream_from_batches(batches))
}
/// Returns true if data of this chunk is sorted
fn is_sorted_on_pk(&self) -> bool {
false
}
/// Returns the sort key of the chunk if any
fn sort_key(&self) -> Option<SortKey<'_>> {
None
}
fn chunk_type(&self) -> &str {
"Test Chunk"
}
@ -999,6 +991,10 @@ impl QueryChunkMeta for TestChunk {
Arc::clone(&self.schema)
}
fn sort_key(&self) -> Option<&SortKey> {
self.sort_key.as_ref()
}
// return a reference to delete predicates of the chunk
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
let pred = &self.delete_predicates;

View File

@ -55,22 +55,23 @@ pub fn arrow_pk_sort_exprs(
}
pub fn arrow_sort_key_exprs(
sort_key: &SortKey<'_>,
sort_key: &SortKey,
input_schema: &ArrowSchema,
) -> Vec<PhysicalSortExpr> {
let mut sort_exprs = vec![];
for (key, options) in sort_key.iter() {
let expr = physical_col(key, input_schema).expect("sort key column in schema");
sort_exprs.push(PhysicalSortExpr {
expr,
options: SortOptions {
descending: options.descending,
nulls_first: options.nulls_first,
},
});
}
sort_exprs
sort_key
.iter()
.flat_map(|(key, options)| {
// Skip over missing columns
let expr = physical_col(key, input_schema).ok()?;
Some(PhysicalSortExpr {
expr,
options: SortOptions {
descending: options.descending,
nulls_first: options.nulls_first,
},
})
})
.collect()
}
/// Build a datafusion physical expression from its logical one

View File

@ -17,6 +17,7 @@ async fn run_table_schema_test_case<D>(
selection: Selection<'_>,
table_name: &str,
expected_schema: Schema,
expected_sort_key: Option<&SortKey>,
) where
D: DbSetup,
{
@ -48,6 +49,8 @@ async fn run_table_schema_test_case<D>(
expected_schema,
actual_schema
);
assert_eq!(chunk.sort_key(), expected_sort_key);
}
}
assert!(
@ -59,7 +62,7 @@ async fn run_table_schema_test_case<D>(
#[tokio::test]
async fn list_schema_cpu_all_mub() {
// we expect columns to come out in lexographic order by name
// we expect columns to come out in lexicographic order by name
let expected_schema = SchemaBuilder::new()
.tag("region")
.timestamp()
@ -72,23 +75,22 @@ async fn list_schema_cpu_all_mub() {
Selection::All,
"cpu",
expected_schema,
None,
)
.await;
}
#[tokio::test]
async fn list_schema_cpu_all_rub() {
// we expect columns to come out in lexographic order by name
// we expect columns to come out in lexicographic order by name
// The schema of RUB includes sort key
let mut sort_key = SortKey::with_capacity(2);
sort_key.push("region", Default::default());
sort_key.push(TIME_COLUMN_NAME, Default::default());
let sort_key = SortKey::from_columns(vec!["region", TIME_COLUMN_NAME]);
let expected_schema = SchemaBuilder::new()
.tag("region")
.timestamp()
.field("user", DataType::Float64)
.build_with_sort_key(&sort_key)
.build()
.unwrap();
run_table_schema_test_case(
@ -96,6 +98,7 @@ async fn list_schema_cpu_all_rub() {
Selection::All,
"cpu",
expected_schema,
Some(&sort_key),
)
.await;
}
@ -103,15 +106,13 @@ async fn list_schema_cpu_all_rub() {
#[tokio::test]
async fn list_schema_cpu_all_rub_set_sort_key() {
// The schema of RUB includes sort key
let mut sort_key = SortKey::with_capacity(2);
sort_key.push("region", Default::default());
sort_key.push(TIME_COLUMN_NAME, Default::default());
let sort_key = SortKey::from_columns(vec!["region", TIME_COLUMN_NAME]);
let expected_schema = SchemaBuilder::new()
.tag("region")
.timestamp()
.field("user", DataType::Float64)
.build_with_sort_key(&sort_key)
.build()
.unwrap();
run_table_schema_test_case(
@ -119,6 +120,7 @@ async fn list_schema_cpu_all_rub_set_sort_key() {
Selection::All,
"cpu",
expected_schema,
Some(&sort_key),
)
.await;
@ -127,7 +129,7 @@ async fn list_schema_cpu_all_rub_set_sort_key() {
#[tokio::test]
async fn list_schema_disk_all() {
// we expect columns to come out in lexographic order by name
// we expect columns to come out in lexicographic order by name
let expected_schema = SchemaBuilder::new()
.field("bytes", DataType::Int64)
.tag("region")
@ -140,6 +142,7 @@ async fn list_schema_disk_all() {
Selection::All,
"disk",
expected_schema,
None,
)
.await;
}
@ -160,20 +163,21 @@ async fn list_schema_cpu_selection() {
selection,
"cpu",
expected_schema,
None,
)
.await;
}
#[tokio::test]
async fn list_schema_disk_selection() {
// we expect columns to come out in lexographic order by name
// we expect columns to come out in lexicographic order by name
let expected_schema = SchemaBuilder::new()
.timestamp()
.field("bytes", DataType::Int64)
.build()
.unwrap();
// Pick an order that is not lexographic
// Pick an order that is not lexicographic
let selection = Selection::Some(&["time", "bytes"]);
run_table_schema_test_case(
@ -181,13 +185,14 @@ async fn list_schema_disk_selection() {
selection,
"disk",
expected_schema,
None,
)
.await;
}
#[tokio::test]
async fn list_schema_location_all() {
// we expect columns to come out in lexographic order by name
// we expect columns to come out in lexicographic order by name
let expected_schema = SchemaBuilder::new()
.field("count", DataType::UInt64)
.timestamp()
@ -200,197 +205,7 @@ async fn list_schema_location_all() {
Selection::All,
"restaurant",
expected_schema,
None,
)
.await;
}
#[tokio::test]
async fn test_set_sort_key_valid_same_order() {
// Build the expected schema with sort key
let mut sort_key = SortKey::with_capacity(3);
sort_key.push("tag1", Default::default());
sort_key.push("time", Default::default());
sort_key.push("tag2", Default::default());
let expected_schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
// The same schema without sort key
let mut schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build()
.unwrap();
schema.set_sort_key(&sort_key);
assert_eq!(
expected_schema, schema,
"Schema mismatch \nExpected:\n{:#?}\nActual:\n{:#?}\n",
expected_schema, schema
);
}
#[tokio::test]
async fn test_set_sort_key_valid_different_order() {
// Build the expected schema with sort key "time, tag2, tag1"
let mut sort_key = SortKey::with_capacity(3);
sort_key.push("time", Default::default());
sort_key.push("tag2", Default::default());
sort_key.push("tag1", Default::default());
let expected_schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
// The same schema without sort key
let mut schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build()
.unwrap();
schema.set_sort_key(&sort_key);
assert_eq!(
expected_schema, schema,
"Schema mismatch \nExpected:\n{:#?}\nActual:\n{:#?}\n",
expected_schema, schema
);
}
#[tokio::test]
async fn test_set_sort_key_valid_subset() {
// Build the expected schema with sort key "time, tag1"
let mut sort_key = SortKey::with_capacity(2);
sort_key.push("time", Default::default());
sort_key.push("tag1", Default::default());
let expected_schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
// The same schema without sort key
let mut schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build()
.unwrap();
// set sort key for it
schema.set_sort_key(&sort_key);
assert_eq!(
expected_schema, schema,
"Schema mismatch \nExpected:\n{:#?}\nActual:\n{:#?}\n",
expected_schema, schema
);
}
#[tokio::test]
async fn test_set_sort_key_valid_subset_of_fully_set() {
// Build sort key "tag1, time, tag2"
let mut sort_key = SortKey::with_capacity(3);
sort_key.push("tag1", Default::default());
sort_key.push("time", Default::default());
sort_key.push("tag2", Default::default());
// The schema with sort key
let mut schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
// reset sort key to "tag2, time"
let mut sort_key = SortKey::with_capacity(2);
sort_key.push("tag2", Default::default());
sort_key.push("time", Default::default());
schema.set_sort_key(&sort_key);
// Expected schema with "tag2, time" as sort key
let expected_schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
assert_eq!(
expected_schema, schema,
"Schema mismatch \nExpected:\n{:#?}\nActual:\n{:#?}\n",
expected_schema, schema
);
}
#[tokio::test]
async fn test_set_sort_key_invalid_not_exist() {
// Build the expected schema with sort key "time"
let mut sort_key = SortKey::with_capacity(1);
sort_key.push("time", Default::default());
let expected_schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build_with_sort_key(&sort_key)
.unwrap();
// The same schema without sort key
let mut schema = SchemaBuilder::new()
.tag("tag1")
.timestamp()
.tag("tag2")
.field("field_int", DataType::Int64)
.field("field_float", DataType::Float64)
.build()
.unwrap();
// Nuild sort key that include valid "time" and invalid "no_tag"
let mut sort_key = SortKey::with_capacity(2);
sort_key.push("time", Default::default());
// invalid column
sort_key.push("not_tag", Default::default());
// The invalid key will be ignored in this function
schema.set_sort_key(&sort_key);
assert_eq!(
expected_schema, schema,
"Schema mismatch \nExpected:\n{:#?}\nActual:\n{:#?}\n",
expected_schema, schema
);
}

View File

@ -3,8 +3,6 @@ use std::convert::TryInto;
use arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField};
use snafu::{ResultExt, Snafu};
use crate::sort::SortKey;
use super::{InfluxColumnType, InfluxFieldType, Schema, TIME_COLUMN_NAME};
/// Database schema creation / validation errors.
@ -145,20 +143,11 @@ impl SchemaBuilder {
/// assert_eq!(influxdb_column_type, Some(InfluxColumnType::Timestamp));
/// ```
pub fn build(&mut self) -> Result<Schema> {
self.build_with_sort_key(&Default::default())
}
pub fn build_with_sort_key(&mut self, sort_key: &SortKey<'_>) -> Result<Schema> {
assert!(!self.finished, "build called multiple times");
self.finished = true;
Schema::new_from_parts(
self.measurement.take(),
self.fields.drain(..),
sort_key,
false,
)
.context(ValidatingSchemaSnafu)
Schema::new_from_parts(self.measurement.take(), self.fields.drain(..), false)
.context(ValidatingSchemaSnafu)
}
/// Internal helper method to add a column definition

View File

@ -13,11 +13,10 @@ use arrow::datatypes::{
};
use hashbrown::HashSet;
use crate::sort::SortKey;
use selection::Selection;
use snafu::{OptionExt, Snafu};
use crate::sort::{ColumnSort, SortKey};
/// The name of the timestamp column in the InfluxDB datamodel
pub const TIME_COLUMN_NAME: &str = "time";
@ -111,7 +110,6 @@ impl TryFrom<ArrowSchemaRef> for Schema {
const MEASUREMENT_METADATA_KEY: &str = "iox::measurement::name";
const COLUMN_METADATA_KEY: &str = "iox::column::type";
const COLUMN_SORT_METADATA_KEY: &str = "iox::column::sort";
impl Schema {
/// Create a new Schema wrapper over the schema
@ -161,7 +159,6 @@ impl Schema {
pub(crate) fn new_from_parts(
measurement: Option<String>,
fields: impl Iterator<Item = (ArrowField, Option<InfluxColumnType>)>,
sort_key: &SortKey<'_>,
sort_columns: bool,
) -> Result<Self> {
let mut metadata = HashMap::new();
@ -170,17 +167,9 @@ impl Schema {
metadata.insert(MEASUREMENT_METADATA_KEY.to_string(), measurement);
}
let mut sort_ordinals = Vec::with_capacity(sort_key.len());
let mut fields: Vec<ArrowField> = fields
.map(|(mut field, column_type)| {
match sort_key.get(field.name()) {
Some(sort) => {
sort_ordinals.push(sort.sort_ordinal);
set_field_metadata(&mut field, column_type, Some(sort))
}
None => set_field_metadata(&mut field, column_type, None),
}
set_field_metadata(&mut field, column_type);
field
})
.collect();
@ -194,77 +183,14 @@ impl Schema {
let record =
ArrowSchemaRef::new(ArrowSchema::new_with_metadata(fields, metadata)).try_into()?;
// This must be after validation in case of duplicate columns
sort_ordinals.sort_unstable();
for (idx, ordinal) in sort_ordinals.iter().enumerate() {
if idx != *ordinal {
return Err(Error::SortColumnNotFound {
column_name: sort_key.get_index(idx).unwrap().0.to_string(),
});
}
}
if sort_ordinals.len() != sort_key.len() {
return Err(Error::SortColumnNotFound {
column_name: sort_key
.get_index(sort_ordinals.len())
.unwrap()
.0
.to_string(),
});
}
Ok(record)
}
/// Set the order of sort columns to the specified `sort_key`
pub fn set_sort_key(&mut self, sort_key: &SortKey<'_>) {
let fields = self.inner.fields();
// create a new_fields that are the fields with their sort keys set
let new_fields = fields
/// Returns true if the sort_key includes all primary key cols
pub fn is_sorted_on_pk(&self, sort_key: &SortKey) -> bool {
self.primary_key()
.iter()
.map(|field| {
let mut new_field = field.clone();
let mut meta = std::collections::BTreeMap::new();
if let Some(sort) = sort_key.get(field.name()) {
// New sort key
meta.insert(COLUMN_SORT_METADATA_KEY.to_string(), sort.to_string());
}
// Keep other meta data
if let Some(metadata) = field.metadata() {
for (key, value) in metadata {
if key.ne(&COLUMN_SORT_METADATA_KEY.to_string()) {
meta.insert(key.clone(), value.clone());
}
}
}
new_field.set_metadata(Some(meta));
new_field
})
.collect();
let new_meta = self.inner.metadata().clone();
let new_schema = ArrowSchema::new_with_metadata(new_fields, new_meta);
self.inner = Arc::new(new_schema);
}
/// Returns true of the sort_key include all primary key cols
pub fn is_sorted_on_pk(&self) -> bool {
if let Some(sort_key) = self.sort_key() {
let key_columns = self.primary_key();
for key_col in key_columns {
if sort_key.get(key_col).is_none() {
return false; // pk col is not part of the sort key
}
}
true
} else {
// not sorted yet
false
}
.all(|col| sort_key.get(*col).is_some())
}
/// Provide a reference to the underlying Arrow Schema object
@ -425,36 +351,6 @@ impl Schema {
self.select(Selection::Some(selection))
}
/// Returns the sort key if any
pub fn sort_key(&self) -> Option<SortKey<'_>> {
// Find all the sorted columns
let mut columns: Vec<_> = self
.inner
.fields()
.iter()
.enumerate()
.flat_map(|(idx, field)| Some((idx, get_sort(field)?)))
.collect();
columns.sort_unstable_by_key(|(_, sort)| sort.sort_ordinal);
let mut sort_key = SortKey::with_capacity(columns.len());
for (idx, (column_idx, sort)) in columns.into_iter().enumerate() {
// If the schema has been projected with only some of the columns
// the sort key may be truncated
if sort.sort_ordinal != idx {
break;
}
sort_key.push(self.inner.field(column_idx).name().as_str(), sort.options)
}
if !sort_key.is_empty() {
return Some(sort_key);
}
None
}
/// Return columns used for the "primary key" in this table.
///
/// Currently this relies on the InfluxDB data model annotations
@ -501,32 +397,14 @@ pub(crate) fn get_influx_type(field: &ArrowField) -> Option<InfluxColumnType> {
.ok()
}
/// Gets the column sort for a field
pub(crate) fn get_sort(field: &ArrowField) -> Option<ColumnSort> {
field
.metadata()
.as_ref()?
.get(COLUMN_SORT_METADATA_KEY)?
.parse()
.ok()
}
/// Sets the metadata for a field - replacing any existing metadata
pub(crate) fn set_field_metadata(
field: &mut ArrowField,
column_type: Option<InfluxColumnType>,
sort: Option<ColumnSort>,
) {
pub(crate) fn set_field_metadata(field: &mut ArrowField, column_type: Option<InfluxColumnType>) {
let mut metadata = std::collections::BTreeMap::new();
if let Some(column_type) = column_type {
metadata.insert(COLUMN_METADATA_KEY.to_string(), column_type.to_string());
}
if let Some(sort) = sort {
metadata.insert(COLUMN_SORT_METADATA_KEY.to_string(), sort.to_string());
}
field.set_metadata(Some(metadata))
}
@ -732,13 +610,9 @@ macro_rules! assert_column_eq {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use InfluxColumnType::*;
use InfluxFieldType::*;
use crate::merge::SchemaMerger;
use super::{builder::SchemaBuilder, *};
fn make_field(
@ -1158,115 +1032,11 @@ mod test {
);
}
#[test]
fn test_sort() {
let mut sort_key = SortKey::with_capacity(3);
sort_key.push("tag4", Default::default());
sort_key.push("tag3", Default::default());
sort_key.push("tag2", Default::default());
sort_key.push("tag1", Default::default());
sort_key.push(TIME_COLUMN_NAME, Default::default());
let schema1 = SchemaBuilder::new()
.influx_field("the_field", String)
.tag("tag1")
.tag("tag2")
.tag("tag3")
.tag("tag4")
.timestamp()
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.unwrap();
let projected = schema1
.select_by_names(&["tag4", "tag2", "tag3", "time"])
.unwrap();
let projected_key: Vec<_> = projected.sort_key().unwrap().iter().map(|x| *x.0).collect();
let m1 = SchemaMerger::new().merge(&schema1).unwrap().build();
let m2 = SchemaMerger::new()
.merge(&schema1)
.unwrap()
.build_with_sort_key(&sort_key);
assert_eq!(schema1.sort_key().unwrap(), sort_key);
assert_eq!(m1.sort_key(), None);
assert_eq!(m2.sort_key().unwrap(), sort_key);
assert_eq!(projected_key, vec!["tag4", "tag3", "tag2"])
}
#[test]
fn test_sort_missing_column() {
let mut sort_key = SortKey::with_capacity(3);
sort_key.push(
"the_field",
SortOptions {
descending: true,
nulls_first: false,
},
);
sort_key.push("a", Default::default());
sort_key.push(TIME_COLUMN_NAME, Default::default());
// Verify missing columns are detected
let err = SchemaBuilder::new()
.influx_field("the_field", String)
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.unwrap_err();
assert!(matches!(
err,
builder::Error::ValidatingSchema {
source: Error::SortColumnNotFound {
column_name
}
} if &column_name == "a"
));
// Verify duplicate columns don't break truncation
let err = SchemaBuilder::new()
.influx_field("the_field", String)
.influx_field("a", String)
.timestamp()
.timestamp()
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.unwrap_err();
assert!(matches!(
err,
builder::Error::ValidatingSchema {
source: Error::DuplicateColumnName { .. }
}
));
// Verify sort key gaps are detected
let err = SchemaBuilder::new()
.influx_field("a", String)
.influx_field("the_field", String)
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.unwrap_err();
assert!(matches!(err, builder::Error::ValidatingSchema {
source: Error::SortColumnNotFound {
column_name
}
} if &column_name == "time" ));
}
#[test]
fn test_is_sort_on_pk() {
// Sort key the same as pk
let mut sort_key = SortKey::with_capacity(3);
sort_key.with_col("tag4");
sort_key.with_col("tag3");
sort_key.with_col("tag2");
sort_key.with_col("tag1");
sort_key.with_col(TIME_COLUMN_NAME);
let sort_key =
SortKey::from_columns(vec!["tag4", "tag3", "tag2", "tag1", TIME_COLUMN_NAME]);
let schema = SchemaBuilder::new()
.influx_field("the_field", String)
@ -1276,29 +1046,13 @@ mod test {
.tag("tag4")
.timestamp()
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.build()
.unwrap();
assert!(schema.is_sorted_on_pk());
assert!(schema.is_sorted_on_pk(&sort_key));
// Sort key does not include all pk cols
let mut sort_key = SortKey::with_capacity(3);
sort_key.with_col("tag3");
sort_key.with_col("tag1");
sort_key.with_col(TIME_COLUMN_NAME);
let sort_key = SortKey::from_columns(vec!["tag3", "tag1", TIME_COLUMN_NAME]);
let schema = SchemaBuilder::new()
.influx_field("the_field", String)
.tag("tag1")
.tag("tag2")
.tag("tag3")
.tag("tag4")
.timestamp()
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.unwrap();
assert!(!schema.is_sorted_on_pk());
// No sort key
let schema = SchemaBuilder::new()
.influx_field("the_field", String)
.tag("tag1")
@ -1309,19 +1063,10 @@ mod test {
.measurement("the_measurement")
.build()
.unwrap();
assert!(!schema.is_sorted_on_pk());
// No PK, no sort key
let schema = SchemaBuilder::new()
.influx_field("the_field", String)
.measurement("the_measurement")
.build()
.unwrap();
assert!(!schema.is_sorted_on_pk());
assert!(!schema.is_sorted_on_pk(&sort_key));
// No PK, sort key on non pk
let mut sort_key = SortKey::with_capacity(3);
sort_key.with_col("the_field");
let sort_key = SortKey::from_columns(vec!["the_field"]);
let schema = SchemaBuilder::new()
.influx_field("the_field", String)
@ -1331,8 +1076,8 @@ mod test {
.tag("tag4")
.timestamp()
.measurement("the_measurement")
.build_with_sort_key(&sort_key)
.build()
.unwrap();
assert!(!schema.is_sorted_on_pk());
assert!(!schema.is_sorted_on_pk(&sort_key));
}
}

View File

@ -8,8 +8,6 @@ use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use snafu::Snafu;
use crate::sort::SortKey;
use super::{InfluxColumnType, Schema};
/// Database schema creation / validation errors.
@ -137,7 +135,7 @@ impl SchemaMerger {
Ok(self)
}
fn merge_field(
pub fn merge_field(
&mut self,
field: &Field,
column_type: Option<InfluxColumnType>,
@ -187,18 +185,10 @@ impl SchemaMerger {
}
/// Returns the schema that was built, the columns are always sorted in lexicographic order
pub fn build(self) -> Schema {
self.build_with_sort_key(&Default::default())
}
/// Returns the schema that was built, the columns are always sorted in lexicographic order
///
/// Additionally specifies a sort key for the data
pub fn build_with_sort_key(mut self, sort_key: &SortKey<'_>) -> Schema {
pub fn build(mut self) -> Schema {
Schema::new_from_parts(
self.measurement.take(),
self.fields.drain().map(|x| x.1),
sort_key,
true,
)
.expect("failed to build merged schema")

View File

@ -1,3 +1,4 @@
use std::sync::Arc;
use std::{fmt::Display, str::FromStr};
use arrow::compute::SortOptions;
@ -68,22 +69,79 @@ impl std::fmt::Display for ColumnSort {
}
}
#[derive(Debug, Default, Eq, PartialEq, Clone)]
pub struct SortKey<'a> {
columns: IndexMap<&'a str, SortOptions>,
#[derive(Debug, Default)]
pub struct SortKeyBuilder {
columns: IndexMap<Arc<str>, SortOptions>,
}
impl<'a> SortKey<'a> {
/// Create a new empty sort key that can store `capacity` columns without allocating
impl SortKeyBuilder {
pub fn new() -> Self {
Self::default()
}
pub fn with_capacity(capacity: usize) -> Self {
Self {
columns: IndexMap::with_capacity(capacity),
}
}
/// Adds a new column to the end of this sort key
pub fn push(&mut self, column: &'a str, options: SortOptions) {
self.columns.insert(column, options);
pub fn with_col(self, column: impl Into<Arc<str>>) -> Self {
self.with_col_sort_opts(column, Default::default())
}
/// Helper to insert col with specified sort options into sort key
pub fn with_col_opts(
self,
col: impl Into<Arc<str>>,
descending: bool,
nulls_first: bool,
) -> Self {
self.with_col_sort_opts(
col,
SortOptions {
descending,
nulls_first,
},
)
}
pub fn with_col_sort_opts(mut self, col: impl Into<Arc<str>>, options: SortOptions) -> Self {
self.columns.insert(col.into(), options);
self
}
pub fn build(self) -> SortKey {
SortKey {
columns: Arc::new(self.columns),
}
}
}
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct SortKey {
columns: Arc<IndexMap<Arc<str>, SortOptions>>,
}
impl SortKey {
/// Create a new empty sort key
pub fn empty() -> Self {
SortKey {
columns: Default::default(),
}
}
/// Create a new sort key from the provided columns
pub fn from_columns<C, I>(columns: C) -> Self
where
C: IntoIterator<Item = I>,
I: Into<Arc<str>>,
{
let iter = columns.into_iter();
let mut builder = SortKeyBuilder::with_capacity(iter.size_hint().0);
for c in iter {
builder = builder.with_col(c);
}
builder.build()
}
/// Gets the ColumnSort for a given column name
@ -96,10 +154,8 @@ impl<'a> SortKey<'a> {
}
/// Gets the column for a given index
pub fn get_index(&self, idx: usize) -> Option<(&'a str, SortOptions)> {
self.columns
.get_index(idx)
.map(|(col, options)| (*col, *options))
pub fn get_index(&self, idx: usize) -> Option<(&Arc<str>, &SortOptions)> {
self.columns.get_index(idx)
}
/// Return the index of the given column and its sort option. Return None otherwise.
@ -116,7 +172,7 @@ impl<'a> SortKey<'a> {
}
/// Returns an iterator over the columns in this key
pub fn iter(&self) -> Iter<'_, &'a str, SortOptions> {
pub fn iter(&self) -> Iter<'_, Arc<str>, SortOptions> {
self.columns.iter()
}
@ -130,23 +186,6 @@ impl<'a> SortKey<'a> {
self.columns.is_empty()
}
/// Returns a subset of the sort key that includes only the given columns
pub fn selected_sort_key(&self, select_keys: Vec<&str>) -> SortKey<'a> {
let keys: IndexMap<&'a str, SortOptions> = self
.columns
.iter()
.filter_map(|(col, options)| {
if select_keys.iter().any(|key| key == col) {
Some((*col, *options))
} else {
None
}
})
.collect();
SortKey { columns: keys }
}
/// Returns merge key of the 2 given keys if one covers the other. Returns None otherwise.
/// Key1 is said to cover key2 if key2 is a subset and in the same order of key1.
/// Examples:
@ -165,28 +204,19 @@ impl<'a> SortKey<'a> {
/// super key of (a, b, c) and any of { b, a), (c, a), (c, b), (b, a, c), (b, c, a), (c, a, b), (c, b, a) } is None
///
/// Note that the last column in the sort key must be time
pub fn try_merge_key(key1: &SortKey<'a>, key2: &SortKey<'a>) -> Option<SortKey<'a>> {
pub fn try_merge_key<'a>(key1: &'a SortKey, key2: &'a SortKey) -> Option<&'a SortKey> {
if key1.is_empty() || key2.is_empty() {
panic!("Sort key cannot be empty");
}
let key1 = key1.clone();
let key2 = key2.clone();
// Verify if time column in the sort key
match key1.columns.get_index_of(TIME_COLUMN_NAME) {
None => panic!("Time column is not included in the sort key {:#?}", key1),
Some(idx) => {
if idx < key1.len() - 1 {
panic!("Time column is not last in the sort key {:#?}", key1)
}
}
}
match key2.columns.get_index_of(TIME_COLUMN_NAME) {
None => panic!("Time column is not included in the sort key {:#?}", key2),
Some(idx) => {
if idx < key2.len() - 1 {
panic!("Time column is not last in the sort key {:#?}", key2)
for key in [&key1, &key2] {
match key.columns.get_index_of(TIME_COLUMN_NAME) {
None => panic!("Time column is not included in the sort key {:#?}", key),
Some(idx) => {
if idx < key.len() - 1 {
panic!("Time column is not last in the sort key {:#?}", key)
}
}
}
}
@ -199,7 +229,7 @@ impl<'a> SortKey<'a> {
// Go over short key and check its right-order availability in the long key
let mut prev_long_idx: Option<usize> = None;
for (col, sort_options) in &short_key.columns {
for (col, sort_options) in short_key.columns.iter() {
if let Some(long_idx) = long_key.find_index(col, sort_options) {
match prev_long_idx {
None => prev_long_idx = Some(long_idx),
@ -222,29 +252,13 @@ impl<'a> SortKey<'a> {
// Reach here means the long key is the super key of the sort one
Some(long_key)
}
/// Helper to insert col with default sort options into sort key
pub fn with_col(&mut self, col: &'a str) {
self.push(col, Default::default());
}
/// Helper to insert col with specified sort options into sort key
pub fn with_col_opts(&mut self, col: &'a str, descending: bool, nulls_first: bool) {
self.push(
col,
SortOptions {
descending,
nulls_first,
},
);
}
}
// Produces a human-readable representation of a sort key that looks like:
//
// "host, region DESC, env NULLS FIRST, time"
//
impl<'a> Display for SortKey<'a> {
impl Display for SortKey {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
for (i, (name, options)) in self.columns.iter().enumerate() {
write!(f, "{}", name)?;
@ -305,10 +319,7 @@ mod tests {
#[test]
fn test_basic() {
let mut key = SortKey::with_capacity(3);
key.push("a", Default::default());
key.push("c", Default::default());
key.push("b", Default::default());
let key = SortKey::from_columns(vec!["a", "c", "b"]);
assert_eq!(key.len(), 3);
assert!(!key.is_empty());
@ -339,48 +350,50 @@ mod tests {
#[test]
fn test_sort_key_eq() {
let mut key1 = SortKey::with_capacity(1);
key1.with_col("a");
let key1 = SortKey::from_columns(vec!["a"]);
let mut key1_2 = SortKey::with_capacity(2);
key1_2.with_col("a");
key1_2.with_col_opts("b", true, false);
let key1_2 = SortKeyBuilder::with_capacity(2)
.with_col("a")
.with_col_opts("b", true, false)
.build();
let key2 = SortKey::with_capacity(2);
let key2 = SortKey::empty();
// different keys
assert_ne!(key1, key2);
assert_ne!(key1_2, key2);
assert_ne!(key1, key1_2);
let mut key3 = SortKey::with_capacity(1);
key3.with_col("a");
let key3 = SortKey::from_columns(vec!["a"]);
let mut key3_2 = SortKey::with_capacity(2);
key3_2.with_col("a");
key3_2.with_col_opts("b", true, false);
let key3_2 = SortKeyBuilder::with_capacity(2)
.with_col("a")
.with_col_opts("b", true, false)
.build();
// same
assert_eq!(key1, key3);
assert_eq!(key1_2, key3_2);
let mut key4 = SortKey::with_capacity(1);
key4.with_col("aa");
let key4 = SortKey::from_columns(vec!["aa"]);
let mut key4_2 = SortKey::with_capacity(2);
key4_2.with_col("aa");
key4_2.with_col_opts("bb", true, false);
let key4_2 = SortKeyBuilder::with_capacity(2)
.with_col("aa")
.with_col_opts("bb", true, false)
.build();
// different key, same value
assert_ne!(key1, key4);
assert_ne!(key1_2, key4_2);
let mut key5 = SortKey::with_capacity(1);
key5.with_col_opts("a", true, true);
let key5 = SortKeyBuilder::with_capacity(1)
.with_col_opts("a", true, true)
.build();
let mut key5_2 = SortKey::with_capacity(2);
key5_2.with_col_opts("a", true, true);
key5_2.with_col_opts("b", false, true);
let key5_2 = SortKeyBuilder::with_capacity(2)
.with_col_opts("a", true, true)
.with_col_opts("b", false, true)
.build();
// same key, different value
assert_ne!(key1, key5);
@ -390,39 +403,41 @@ mod tests {
// Note that the last column must be TIME_COLUMN_NAME to avoid panicking
#[test]
fn test_super_sort_key() {
// key (a) with default sort options (false, true)
let mut key_a = SortKey::with_capacity(1);
let a = TIME_COLUMN_NAME;
key_a.with_col(a);
// key (a) with default sort options (false, true)
let key_a = SortKey::from_columns(vec![a]);
// key (a) with explicitly defined sort options
let mut key_a_2 = SortKey::with_capacity(1);
key_a_2.with_col_opts(a, true, false);
let key_a_2 = SortKeyBuilder::with_capacity(1)
.with_col_opts(TIME_COLUMN_NAME, true, false)
.build();
// super key of (a) and (a) is (a)
let merge_key = SortKey::try_merge_key(&key_a, &key_a).unwrap();
assert_eq!(merge_key, key_a);
assert_eq!(merge_key, &key_a);
let merge_key = SortKey::try_merge_key(&key_a_2, &key_a_2).unwrap();
assert_eq!(merge_key, key_a_2);
assert_eq!(merge_key, &key_a_2);
// (a,b)
let b = TIME_COLUMN_NAME;
let mut key_ab = SortKey::with_capacity(2);
key_ab.with_col("a");
key_ab.with_col(b);
let mut key_ab_2 = SortKey::with_capacity(2);
key_ab_2.with_col_opts("a", true, false);
key_ab_2.with_col_opts(b, false, false);
let key_ab = SortKey::from_columns(vec!["a", TIME_COLUMN_NAME]);
let key_ab_2 = SortKeyBuilder::with_capacity(2)
.with_col_opts("a", true, false)
.with_col_opts(b, false, false)
.build();
//(b)
let mut key_b = SortKey::with_capacity(1);
key_b.with_col(b);
let mut key_b_2 = SortKey::with_capacity(1);
key_b_2.with_col_opts(b, false, false);
let key_b = SortKey::from_columns(vec![b]);
let key_b_2 = SortKeyBuilder::with_capacity(1)
.with_col_opts(b, false, false)
.build();
// super key of (a, b) and (b) is (a, b)
let merge_key = SortKey::try_merge_key(&key_ab, &key_b).unwrap();
assert_eq!(merge_key, key_ab);
assert_eq!(merge_key, &key_ab);
let merge_key = SortKey::try_merge_key(&key_ab_2, &key_b_2).unwrap();
assert_eq!(merge_key, key_ab_2);
assert_eq!(merge_key, &key_ab_2);
// super key of (a, b) and (b') is None
let merge_key = SortKey::try_merge_key(&key_ab, &key_b_2);
assert_eq!(merge_key, None);
@ -431,9 +446,9 @@ mod tests {
// super key of (a, b) and (a, b) is (a, b)
let merge_key = SortKey::try_merge_key(&key_ab, &key_ab).unwrap();
assert_eq!(merge_key, key_ab);
assert_eq!(merge_key, &key_ab);
let merge_key = SortKey::try_merge_key(&key_ab_2, &key_ab_2).unwrap();
assert_eq!(merge_key, key_ab_2);
assert_eq!(merge_key, &key_ab_2);
// super key of (a, b) and (a',b') is None
let merge_key = SortKey::try_merge_key(&key_ab, &key_ab_2);
assert_eq!(merge_key, None);
@ -442,103 +457,46 @@ mod tests {
// (a, b, c)
let c = TIME_COLUMN_NAME;
let mut key_abc_2 = SortKey::with_capacity(3);
key_abc_2.with_col_opts("a", true, false);
key_abc_2.with_col_opts("b", false, false);
key_abc_2.with_col_opts(c, true, true);
let key_abc_2 = SortKeyBuilder::with_capacity(3)
.with_col_opts("a", true, false)
.with_col_opts("b", false, false)
.with_col_opts(c, true, true)
.build();
// (c)
let mut key_c_2 = SortKey::with_capacity(1);
key_c_2.with_col_opts(c, true, true);
let key_c_2 = SortKeyBuilder::with_capacity(1)
.with_col_opts(c, true, true)
.build();
// (a, c)
let mut key_ac_2 = SortKey::with_capacity(2);
key_ac_2.with_col_opts("a", true, false);
key_ac_2.with_col_opts(c, true, true);
let key_ac_2 = SortKeyBuilder::with_capacity(2)
.with_col_opts("a", true, false)
.with_col_opts(c, true, true)
.build();
// (b,c)
let mut key_bc_2 = SortKey::with_capacity(2);
key_bc_2.with_col_opts("b", false, false);
key_bc_2.with_col_opts(c, true, true);
let key_bc_2 = SortKeyBuilder::with_capacity(2)
.with_col_opts("b", false, false)
.with_col_opts(c, true, true)
.build();
// (b,a,c)
let mut key_bac_2 = SortKey::with_capacity(3);
key_bac_2.with_col_opts("b", false, false);
key_bac_2.with_col_opts("a", true, false);
key_bac_2.with_col_opts(c, true, true);
let key_bac_2 = SortKeyBuilder::with_capacity(3)
.with_col_opts("b", false, false)
.with_col_opts("a", true, false)
.with_col_opts(c, true, true)
.build();
// super key of (a, b, c) and any of { (a, c), (b, c), (a), (b), (c) } is (a, b, c)
let merge_key = SortKey::try_merge_key(&key_abc_2, &key_c_2).unwrap();
assert_eq!(merge_key, key_abc_2);
assert_eq!(merge_key, &key_abc_2);
let merge_key = SortKey::try_merge_key(&key_abc_2, &key_ac_2).unwrap();
assert_eq!(merge_key, key_abc_2);
assert_eq!(merge_key, &key_abc_2);
let merge_key = SortKey::try_merge_key(&key_abc_2, &key_bc_2).unwrap();
assert_eq!(merge_key, key_abc_2);
assert_eq!(merge_key, &key_abc_2);
// super key of (a, b, c) and any of (b, a, c) } is None
let merge_key = SortKey::try_merge_key(&key_abc_2, &key_bac_2);
assert_eq!(merge_key, None);
}
// Verifies `selected_sort_key`: projecting a sort key onto a set of input
// columns keeps only the columns present in the key, preserving the key's
// own column order and per-column sort options.
#[test]
fn test_selected_sort_key() {
    let mut key = SortKey::with_capacity(4);
    key.with_col("a"); // default sort option
    key.with_col_opts("b", true, false);
    key.with_col_opts("c", false, false);
    key.with_col(TIME_COLUMN_NAME);

    // Empty input selects nothing
    let selected = key.selected_sort_key(vec![]);
    assert!(selected.is_empty());

    // Columns absent from the key select nothing
    let selected = key.selected_sort_key(vec!["d", "e"]);
    assert!(selected.is_empty());

    // All key columns in key order -> the full key is selected
    let selected = key.selected_sort_key(vec!["a", "b", "c", TIME_COLUMN_NAME]);
    assert_eq!(selected, key);

    // All key columns in a different order -> still the full key
    let selected = key.selected_sort_key(vec!["c", TIME_COLUMN_NAME, "b", "a"]);
    assert_eq!(selected, key);

    // Subset in key order -> that subset, options preserved
    let selected = key.selected_sort_key(vec!["a", "b"]);
    let mut expected = SortKey::with_capacity(2);
    expected.with_col("a"); // default sort option
    expected.with_col_opts("b", true, false);
    assert_eq!(selected, expected);

    // Single column -> a one-column key
    let selected = key.selected_sort_key(vec![TIME_COLUMN_NAME]);
    let mut expected = SortKey::with_capacity(1);
    expected.with_col(TIME_COLUMN_NAME);
    assert_eq!(selected, expected);

    // Subset in key order with a gap -> that subset
    let selected = key.selected_sort_key(vec!["a", "c", TIME_COLUMN_NAME]);
    let mut expected = SortKey::with_capacity(3);
    expected.with_col("a"); // default sort option
    expected.with_col_opts("c", false, false);
    expected.with_col(TIME_COLUMN_NAME);
    assert_eq!(selected, expected);

    // Subset out of key order -> reordered to follow the key's order
    let selected = key.selected_sort_key(vec![TIME_COLUMN_NAME, "b", "c"]);
    let mut expected = SortKey::with_capacity(3);
    expected.with_col_opts("b", true, false);
    expected.with_col_opts("c", false, false);
    expected.with_col(TIME_COLUMN_NAME);
    assert_eq!(selected, expected);
}
}