2022-01-11 17:51:56 +00:00
|
|
|
# Package metadata for the `iox_catalog` crate. Every field except `name` is
# inherited from the workspace manifest (`<field>.workspace = true`).
[package]
|
|
|
|
name = "iox_catalog"
|
2022-09-26 14:43:00 +00:00
|
|
|
version.workspace = true
|
|
|
|
authors.workspace = true
|
|
|
|
edition.workspace = true
|
|
|
|
license.workspace = true
|
2022-01-11 17:51:56 +00:00
|
|
|
|
|
|
|
# Regular (non-dev) dependencies. Entries that carry a `path` point at sibling
# crates one directory up; the remaining entries are plain version requirements.
# Keep entries in alphabetical order.
[dependencies]
|
2023-07-06 09:58:51 +00:00
|
|
|
async-trait = "0.1.71"
|
2022-05-05 19:29:24 +00:00
|
|
|
data_types = { path = "../data_types" }
|
2022-01-11 17:51:56 +00:00
|
|
|
futures = "0.3"
|
2022-05-05 19:29:24 +00:00
|
|
|
iox_time = { version = "0.1.0", path = "../iox_time" }
|
2022-09-19 22:56:05 +00:00
|
|
|
log = "0.4"
|
2022-02-23 10:39:04 +00:00
|
|
|
metric = { version = "0.1.0", path = "../metric" }
|
2022-02-15 16:18:36 +00:00
|
|
|
mutable_batch = { path = "../mutable_batch" }
|
2022-01-11 17:51:56 +00:00
|
|
|
observability_deps = { path = "../observability_deps" }
|
2023-02-06 22:55:14 +00:00
|
|
|
parking_lot = "0.12"
|
|
|
|
serde = { version = "1.0", features = ["derive"] }
|
refactor: add `parquet_file` PG index for querier (#7842)
* refactor: add `parquet_file` PG index for querier
Currently the `list_by_table_not_to_delete` catalog query is somewhat
expensive:
```text
iox_catalog_prod=> select table_id, sum((to_delete is NULL)::int) as n from parquet_file group by table_id order by n desc limit 5;
table_id | n
----------+------
1489038 | 7221
1489037 | 7019
1491534 | 5793
1491951 | 5522
1513377 | 5339
(5 rows)
iox_catalog_prod=> EXPLAIN ANALYZE SELECT id, namespace_id, table_id, partition_id, object_store_id,
min_time, max_time, to_delete, file_size_bytes,
row_count, compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE table_id = 1489038 AND to_delete IS NULL;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on parquet_file (cost=46050.91..47179.26 rows=283 width=200) (actual time=464.368..472.514 rows=7221 loops=1)
Recheck Cond: ((table_id = 1489038) AND (to_delete IS NULL))
Heap Blocks: exact=7152
-> BitmapAnd (cost=46050.91..46050.91 rows=283 width=0) (actual time=463.341..463.343 rows=0 loops=1)
-> Bitmap Index Scan on parquet_file_table_idx (cost=0.00..321.65 rows=22545 width=0) (actual time=1.674..1.674 rows=7221 loops=1)
Index Cond: (table_id = 1489038)
-> Bitmap Index Scan on parquet_file_deleted_at_idx (cost=0.00..45728.86 rows=1525373 width=0) (actual time=460.717..460.717 rows=4772117 loops=1)
Index Cond: (to_delete IS NULL)
Planning Time: 0.092 ms
Execution Time: 472.907 ms
(10 rows)
```
This appears to happen because PostgreSQL chooses a suboptimal strategy:
it could simply scan the existing `parquet_file_table_idx` index and filter
the matching rows from there:
```text
iox_catalog_prod=> EXPLAIN ANALYZE SELECT id, namespace_id, table_id, partition_id, object_store_id,
min_time, max_time, to_delete, file_size_bytes,
row_count, compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE table_id = 1489038;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------
Index Scan using parquet_file_table_idx on parquet_file (cost=0.57..86237.78 rows=22545 width=200) (actual time=0.057..6.994 rows=7221 loops=1)
Index Cond: (table_id = 1489038)
Planning Time: 0.094 ms
Execution Time: 7.297 ms
(4 rows)
```
However, PostgreSQL's cardinality estimates are not accurate enough for it
to pick that plan on its own. So let's add a dedicated index to make the
querier faster.
* feat: new migration system
* docs: explain dirty migrations
2023-05-31 10:56:32 +00:00
|
|
|
siphasher = "0.3"
|
2022-01-12 23:22:45 +00:00
|
|
|
snafu = "0.7"
|
2023-02-06 22:55:14 +00:00
|
|
|
# Both the "postgres" and "sqlite" features are enabled here.
sqlx = { version = "0.6", features = ["runtime-tokio-rustls", "postgres", "uuid", "sqlite"] }
|
2022-02-15 16:18:36 +00:00
|
|
|
sqlx-hotswap-pool = { path = "../sqlx-hotswap-pool" }
|
2023-07-07 09:25:12 +00:00
|
|
|
thiserror = "1.0.43"
|
2023-06-28 13:18:08 +00:00
|
|
|
tokio = { version = "1.29", features = ["io-util", "macros", "parking_lot", "rt-multi-thread", "time"] }
|
2022-06-17 10:28:28 +00:00
|
|
|
uuid = { version = "1", features = ["v4"] }
|
2023-02-24 18:02:23 +00:00
|
|
|
workspace-hack = { version = "0.1", path = "../workspace-hack" }
|
2022-01-13 22:10:26 +00:00
|
|
|
|
|
|
|
# Dependencies needed only for tests, examples, and benchmarks of this crate.
# Entries with a `path` are sibling crates one directory up.
[dev-dependencies] # In alphabetical order
|
2022-05-05 20:02:16 +00:00
|
|
|
assert_matches = "1.5.0"
|
2023-03-23 02:02:33 +00:00
|
|
|
dotenvy = "0.15.7"
|
2023-05-12 18:45:13 +00:00
|
|
|
generated_types = { path = "../generated_types" }
|
2022-01-27 20:55:18 +00:00
|
|
|
mutable_batch_lp = { path = "../mutable_batch_lp" }
|
2023-07-04 07:57:41 +00:00
|
|
|
paste = "1.0.13"
|
2022-09-01 10:20:26 +00:00
|
|
|
pretty_assertions = "1.3.0"
|
2022-02-17 14:08:25 +00:00
|
|
|
rand = "0.8"
|
2022-02-18 16:42:55 +00:00
|
|
|
tempfile = "3"
|
2023-05-10 17:14:42 +00:00
|
|
|
test_helpers = { path = "../test_helpers" }
|