feat: Last 2 bonus features of remote store get-table (#5991)
* feat: Only get files that aren't already on disk with the reported size * feat: Stream Parquet file bytes to file on disk Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>pull/24376/head
parent
16473f66e7
commit
69a2e6b871
|
@ -5278,6 +5278,7 @@ checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures-core",
|
"futures-core",
|
||||||
|
"futures-io",
|
||||||
"futures-sink",
|
"futures-sink",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
@ -6007,6 +6008,7 @@ dependencies = [
|
||||||
"flatbuffers",
|
"flatbuffers",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
"futures-core",
|
"futures-core",
|
||||||
|
"futures-io",
|
||||||
"futures-sink",
|
"futures-sink",
|
||||||
"futures-task",
|
"futures-task",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
|
|
|
@ -65,7 +65,7 @@ thiserror = "1.0.37"
|
||||||
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
|
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
|
||||||
tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
|
tokio = { version = "1.21", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
|
||||||
tokio-stream = { version = "0.1", features = ["net"] }
|
tokio-stream = { version = "0.1", features = ["net"] }
|
||||||
tokio-util = { version = "0.7.4" }
|
tokio-util = { version = "0.7.4", features = ["compat"] }
|
||||||
tonic = "0.8"
|
tonic = "0.8"
|
||||||
uuid = { version = "1", features = ["v4"] }
|
uuid = { version = "1", features = ["v4"] }
|
||||||
# jemalloc-sys with unprefixed_malloc_on_supported_platforms feature and heappy are mutually exclusive
|
# jemalloc-sys with unprefixed_malloc_on_supported_platforms feature and heappy are mutually exclusive
|
||||||
|
|
|
@ -1,13 +1,15 @@
|
||||||
//! This module implements the `remote store` CLI subcommand
|
//! This module implements the `remote store` CLI subcommand
|
||||||
|
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use futures_util::TryStreamExt;
|
||||||
use influxdb_iox_client::{catalog, connection::Connection, store};
|
use influxdb_iox_client::{catalog, connection::Connection, store};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::{
|
use tokio::{
|
||||||
fs::{self, File},
|
fs::{self, File},
|
||||||
io::AsyncWriteExt,
|
io::{self, AsyncWriteExt},
|
||||||
};
|
};
|
||||||
|
use tokio_util::compat::FuturesAsyncReadCompatExt;
|
||||||
|
|
||||||
#[allow(clippy::enum_variant_names)]
|
#[allow(clippy::enum_variant_names)]
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
|
@ -97,23 +99,36 @@ pub async fn command(connection: Connection, config: Config) -> Result<(), Error
|
||||||
.await?;
|
.await?;
|
||||||
let num_parquet_files = parquet_files.len();
|
let num_parquet_files = parquet_files.len();
|
||||||
println!("found {num_parquet_files} Parquet files, downloading...");
|
println!("found {num_parquet_files} Parquet files, downloading...");
|
||||||
let indexed_parquet_file_metadata = parquet_files
|
let indexed_parquet_file_metadata = parquet_files.into_iter().enumerate();
|
||||||
.into_iter()
|
|
||||||
.map(|pf| (pf.object_store_id, pf.partition_id))
|
for (index, parquet_file) in indexed_parquet_file_metadata {
|
||||||
.enumerate();
|
let uuid = parquet_file.object_store_id;
|
||||||
|
let partition_id = parquet_file.partition_id;
|
||||||
|
let file_size_bytes = parquet_file.file_size_bytes as u64;
|
||||||
|
|
||||||
for (index, (uuid, partition_id)) in indexed_parquet_file_metadata {
|
|
||||||
let index = index + 1;
|
let index = index + 1;
|
||||||
let filename = format!("{uuid}.{partition_id}.parquet");
|
let filename = format!("{uuid}.{partition_id}.parquet");
|
||||||
println!("downloading file {index} of {num_parquet_files} ({filename})...");
|
let file_path = directory.join(&filename);
|
||||||
let mut response = store_client
|
|
||||||
.get_parquet_file_by_object_store_id(uuid.clone())
|
|
||||||
.await?;
|
|
||||||
let mut file = File::create(directory.join(&filename)).await?;
|
|
||||||
while let Some(res) = response.next().await {
|
|
||||||
let res = res.unwrap();
|
|
||||||
|
|
||||||
file.write_all(&res.data).await?;
|
if fs::metadata(&file_path)
|
||||||
|
.await
|
||||||
|
.map_or(false, |metadata| metadata.len() == file_size_bytes)
|
||||||
|
{
|
||||||
|
println!(
|
||||||
|
"skipping file {index} of {num_parquet_files} ({filename} already exists)"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
println!("downloading file {index} of {num_parquet_files} ({filename})...");
|
||||||
|
let mut response = store_client
|
||||||
|
.get_parquet_file_by_object_store_id(uuid.clone())
|
||||||
|
.await?
|
||||||
|
.map_ok(|res| res.data)
|
||||||
|
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))
|
||||||
|
.into_async_read()
|
||||||
|
.compat();
|
||||||
|
let mut file = File::create(file_path).await?;
|
||||||
|
|
||||||
|
io::copy(&mut response, &mut file).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
println!("Done.");
|
println!("Done.");
|
||||||
|
|
|
@ -6,6 +6,7 @@ use futures::FutureExt;
|
||||||
use predicates::prelude::*;
|
use predicates::prelude::*;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
use test_helpers_end_to_end::{maybe_skip_integration, MiniCluster, Step, StepTest, StepTestState};
|
use test_helpers_end_to_end::{maybe_skip_integration, MiniCluster, Step, StepTest, StepTestState};
|
||||||
|
use tokio::fs;
|
||||||
|
|
||||||
/// Get all Parquet files for a table, using the command `remote store get-table`
|
/// Get all Parquet files for a table, using the command `remote store get-table`
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
@ -133,6 +134,46 @@ async fn remote_store_get_table() {
|
||||||
.stderr(predicate::str::contains(
|
.stderr(predicate::str::contains(
|
||||||
"Namespace nacho-namespace not found",
|
"Namespace nacho-namespace not found",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
// Running the same command again shouldn't download any new files
|
||||||
|
Command::cargo_bin("influxdb_iox")
|
||||||
|
.unwrap()
|
||||||
|
.arg("-h")
|
||||||
|
.arg(&router_addr)
|
||||||
|
.arg("remote")
|
||||||
|
.arg("store")
|
||||||
|
.arg("get-table")
|
||||||
|
.arg("-o")
|
||||||
|
.arg(&custom_output_dir)
|
||||||
|
.arg(&namespace)
|
||||||
|
.arg(&other_table_name)
|
||||||
|
.assert()
|
||||||
|
.success()
|
||||||
|
.stdout(predicate::str::contains(format!(
|
||||||
|
"skipping file 1 of 1 ({} already exists)",
|
||||||
|
entries[0].path().file_name().unwrap().to_str().unwrap()
|
||||||
|
)));
|
||||||
|
|
||||||
|
// If the file sizes don't match, re-download that file
|
||||||
|
fs::write(entries[0].path(), b"not parquet").await.unwrap();
|
||||||
|
|
||||||
|
Command::cargo_bin("influxdb_iox")
|
||||||
|
.unwrap()
|
||||||
|
.arg("-h")
|
||||||
|
.arg(&router_addr)
|
||||||
|
.arg("remote")
|
||||||
|
.arg("store")
|
||||||
|
.arg("get-table")
|
||||||
|
.arg("-o")
|
||||||
|
.arg(&custom_output_dir)
|
||||||
|
.arg(&namespace)
|
||||||
|
.arg(&other_table_name)
|
||||||
|
.assert()
|
||||||
|
.success()
|
||||||
|
.stdout(predicate::str::contains(format!(
|
||||||
|
"downloading file 1 of 1 ({})...",
|
||||||
|
entries[0].path().file_name().unwrap().to_str().unwrap()
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
.boxed()
|
.boxed()
|
||||||
})),
|
})),
|
||||||
|
|
|
@ -32,6 +32,7 @@ fixedbitset = { version = "0.4", features = ["std"] }
|
||||||
flatbuffers = { version = "2", features = ["thiserror"] }
|
flatbuffers = { version = "2", features = ["thiserror"] }
|
||||||
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
|
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
|
||||||
futures-core = { version = "0.3", features = ["alloc", "std"] }
|
futures-core = { version = "0.3", features = ["alloc", "std"] }
|
||||||
|
futures-io = { version = "0.3", features = ["std"] }
|
||||||
futures-sink = { version = "0.3", features = ["alloc", "std"] }
|
futures-sink = { version = "0.3", features = ["alloc", "std"] }
|
||||||
futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] }
|
futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] }
|
||||||
futures-util = { version = "0.3", features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] }
|
futures-util = { version = "0.3", features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] }
|
||||||
|
@ -68,7 +69,7 @@ sqlx-core = { version = "0.6", default-features = false, features = ["_rt-tokio"
|
||||||
thrift = { version = "0.16", features = ["log", "server", "threadpool"] }
|
thrift = { version = "0.16", features = ["log", "server", "threadpool"] }
|
||||||
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "parking_lot", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "tracing"] }
|
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "parking_lot", "rt", "rt-multi-thread", "signal", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "tracing"] }
|
||||||
tokio-stream = { version = "0.1", features = ["fs", "net", "time"] }
|
tokio-stream = { version = "0.1", features = ["fs", "net", "time"] }
|
||||||
tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] }
|
tokio-util = { version = "0.7", features = ["codec", "compat", "futures-io", "io", "tracing"] }
|
||||||
tonic = { version = "0.8", features = ["async-trait", "axum", "channel", "codegen", "h2", "hyper", "hyper-timeout", "prost", "prost-derive", "prost1", "tokio", "tower", "tracing-futures", "transport"] }
|
tonic = { version = "0.8", features = ["async-trait", "axum", "channel", "codegen", "h2", "hyper", "hyper-timeout", "prost", "prost-derive", "prost1", "tokio", "tower", "tracing-futures", "transport"] }
|
||||||
tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
|
tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
|
||||||
tower-http = { version = "0.3", features = ["catch-panic", "map-response-body", "tower", "tracing", "util"] }
|
tower-http = { version = "0.3", features = ["catch-panic", "map-response-body", "tower", "tracing", "util"] }
|
||||||
|
@ -96,6 +97,7 @@ either = { version = "1", features = ["use_std"] }
|
||||||
fixedbitset = { version = "0.4", features = ["std"] }
|
fixedbitset = { version = "0.4", features = ["std"] }
|
||||||
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
|
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
|
||||||
futures-core = { version = "0.3", features = ["alloc", "std"] }
|
futures-core = { version = "0.3", features = ["alloc", "std"] }
|
||||||
|
futures-io = { version = "0.3", features = ["std"] }
|
||||||
futures-sink = { version = "0.3", features = ["alloc", "std"] }
|
futures-sink = { version = "0.3", features = ["alloc", "std"] }
|
||||||
futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] }
|
futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] }
|
||||||
futures-util = { version = "0.3", features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] }
|
futures-util = { version = "0.3", features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] }
|
||||||
|
|
Loading…
Reference in New Issue