From f42e0347d7d715014116eec9875e5dffc974a64d Mon Sep 17 00:00:00 2001
From: Scott Anderson
Date: Wed, 9 Jul 2025 17:07:59 -0600
Subject: [PATCH 1/6] fix(clustered): updated clustered restore process, closes
 influxdata/DAR#520

---
 .../clustered/admin/backup-restore.md | 58 ++++++++++++++++---
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index d321c81c3..ec14d5b00 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -171,7 +171,7 @@ INFLUXDB_IOX_DELETE_USING_CATALOG_BACKUP_DATA_SNAPSHOT_FILES: 'true'
 
 After this duration of time, the Garbage Collector deletes _hourly_ snapshots,
 allowing the Garbage Collector to [hard-delete](#hard-delete) Parquet files from the object
-store and the Catalog. The default is `30d`. The recommended range for snapshots is between 
+store and the Catalog. The default is `30d`. The recommended range for snapshots is between
 `1d` and `30d`:
 
 ```yaml
@@ -300,7 +300,7 @@ using Catalog store snapshots:
    kubectl apply --filename myinfluxdb.yml --namespace influxdb
    ```
 
-5. **Disable InfluxDB Clustered components**
+5. **Disable all InfluxDB Clustered components _except the Catalog_**
 
    Use the `kubectl scale` command to scale InfluxDB Clustered components down
    to zero replicas:
@@ -313,17 +313,39 @@ using Catalog store snapshots:
    kubectl scale --namespace influxdb --replicas=0 deployment/global-gc
    kubectl scale --namespace influxdb --replicas=0 deployment/global-router
    kubectl scale --namespace influxdb --replicas=0 deployment/iox-shared-querier
    kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-compactor
    kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-ingester
-   kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-catalog
    ```
 
    > [!Note]
+   > #### Take note of the number of replicas for each pod
+   >
+   > Take note of the number of replicas you have for each pod before scaling
+   > down to make it easier to bring the cluster back up to scale later in the
+   > restore process (step 8).
+   >
+   > #### Clusters under load may take longer to shut down
+   >
    > If the cluster is under load, some pods may take longer to shut down.
    > For example, Ingester pods must flush their Write-Ahead Logs (WAL) before
    > shutting down.
 
-   Verify that pods have been removed from your cluster.
+   Verify that all non-Catalog pods have been removed from your cluster.
+   _Once removed_, proceed to the next step.
 
-6. **Restore the SQL snapshot to the Catalog**
+6. **Disable the Catalog**
+
+   _After all other pods are removed_, Use the `kubectl scale` command to scale
+   your InfluxDB Clustered Catalog down to zero replicas:
+
+
+
+   ```bash
+   kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-catalog
+   ```
+
+   Verify that the Catalog pod has been removed from your cluster.
+   _Once removed_, proceed to the next step.
+
+7. **Restore the SQL snapshot to the Catalog**
 
    Use `psql` to restore the recovery point snapshot to your InfluxDB Catalog. For example:
@@ -334,11 +356,29 @@ using Catalog store snapshots:
 
    ```bash
   ```
 
   The exact `psql` command depends on your PostgreSQL-compatible database
-   provider, their authentication requirements, and the database’s DSN. 
+   provider, their authentication requirements, and the database’s DSN.
 
-7. **Restart InfluxDB Clustered components**
+8. **Scale InfluxDB Clustered components back up**
 
-   1. In your `AppInstance` resource, set `pause` to `false` or remove the `pause`:
+   Use the `kubectl scale` command to scale your InfluxDB Clustered components
+   back up to their original number of replicas. Perform the scaling operations
+   on components in reverse order--for example:
+
+
+
+   ```bash
+   kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-catalog
+   kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-ingester
+   kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-compactor
+   kubectl scale --namespace influxdb --replicas=2 deployment/iox-shared-querier
+   kubectl scale --namespace influxdb --replicas=1 deployment/global-router
+   kubectl scale --namespace influxdb --replicas=1 deployment/global-gc
+   ```
+
+9. **Restart the kubit operator**
+
+   1. In your `AppInstance` resource, set `pause` to `false` or remove the
+      `pause` field:
 
       ```yaml
       apiVersion: kubecfg.dev/v1alpha1
@@ -355,6 +395,8 @@ using Catalog store snapshots:
       Clustered components to the number of replicas defined for each in your
       `AppInstance` resource:
 
+
+
      ```bash
      kubectl apply --filename myinfluxdb.yml --namespace influxdb
      ```

From 573c1621dd42a2d9654b53b9a328bf7101593cbf Mon Sep 17 00:00:00 2001
From: Jason Stirnaman
Date: Thu, 10 Jul 2025 10:33:18 -0500
Subject: [PATCH 2/6] feat(clustered): backup-restore instructions:
 - Add example commands for checking that replicas are active or shut down
 - Add suggestion from @reidkaufmann for checking catalog and ingester counts
 when restoring.
 - Add additional critical notes

---
 .../clustered/admin/backup-restore.md | 174 ++++++++++++++----
 1 file changed, 137 insertions(+), 37 deletions(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index ec14d5b00..e6eeca7c3 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -302,25 +302,13 @@ using Catalog store snapshots:
 
 5. **Disable all InfluxDB Clustered components _except the Catalog_**
 
-   Use the `kubectl scale` command to scale InfluxDB Clustered components down
-   to zero replicas:
-
-
-
-   ```bash
-   kubectl scale --namespace influxdb --replicas=0 deployment/global-gc
-   kubectl scale --namespace influxdb --replicas=0 deployment/global-router
-   kubectl scale --namespace influxdb --replicas=0 deployment/iox-shared-querier
-   kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-compactor
-   kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-ingester
-   ```
-
-   > [!Note]
-   > #### Take note of the number of replicas for each pod
+   > [!Important]
+   > #### Critical shutdown sequence
    >
-   > Take note of the number of replicas you have for each pod before scaling
-   > down to make it easier to bring the cluster back up to scale later in the
-   > restore process (step 8).
+   > You must scale down components in the correct order and wait for each group
+   > to fully shut down before proceeding. Scaling down the catalog before
+   > ingesters have finished shutting down can cause WAL contents to survive
+   > through the restore, leading to data inconsistency and undefined behavior.
    >
    > #### Clusters under load may take longer to shut down
    >
@@ -328,10 +316,57 @@ using Catalog store snapshots:
    > For example, Ingester pods must flush their Write-Ahead Logs (WAL) before
    > shutting down.
 
-   Verify that all non-Catalog pods have been removed from your cluster.
-   _Once removed_, proceed to the next step.
+   > [!Important]
+   > #### Take note of the number of replicas for each pod
+   >
+   > Before scaling down, record the current number of replicas for each component
+   > to restore them to the correct scale later. Use the following commands:
+   > For example, to get the number of replicas for each component:
+   >
+   > ```bash
+   > echo "GC: $(kubectl get deployment global-gc -n influxdb -o jsonpath='{.spec.replicas}')"
+   > echo "Router: $(kubectl get deployment global-router -n influxdb -o jsonpath='{.spec.replicas}')"
+   > echo "Querier: $(kubectl get deployment iox-shared-querier -n influxdb -o jsonpath='{.spec.replicas}')"
+   > echo "Compactor: $(kubectl get statefulset iox-shared-compactor -n influxdb -o jsonpath='{.spec.replicas}')"
+   > echo "Ingester: $(kubectl get statefulset iox-shared-ingester -n influxdb -o jsonpath='{.spec.replicas}')"
+   > echo "Catalog: $(kubectl get statefulset iox-shared-catalog -n influxdb -o jsonpath='{.spec.replicas}')"
+   > ```
 
-6. **Disable the Catalog**
+   1. **Scale down non-critical components first**
+
+      Use the `kubectl scale` command to scale these components down to zero replicas:
+
+
+
+      ```bash
+      kubectl scale --namespace influxdb --replicas=0 deployment/global-gc
+      kubectl scale --namespace influxdb --replicas=0 deployment/global-router
+      kubectl scale --namespace influxdb --replicas=0 deployment/iox-shared-querier
+      kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-compactor
+      ```
+
+   2. **Scale down ingesters and wait for complete shutdown**
+
+      Scale down the ingesters and wait for all ingester pods to fully shut down:
+
+
+
+      ```bash
+      kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-ingester
+      ```
+
+      Verify that all non-Catalog pods have been removed from your cluster.
+
+
+
+      ```bash
+      kubectl get pods --namespace influxdb --selector=app=iox-shared-ingester
+      ```
+
+      _Once removed_, proceed to the next step.
+
+6. **Scale down catalog last**
 
    _After all other pods are removed_, Use the `kubectl scale` command to scale
    your InfluxDB Clustered Catalog down to zero replicas:
@@ -342,12 +377,19 @@ using Catalog store snapshots:
    kubectl scale --namespace influxdb --replicas=0 statefulset/iox-shared-catalog
    ```
 
-   Verify that the Catalog pod has been removed from your cluster.
+   Verify that the Catalog pod has been removed from your cluster:
+
+
+
+   ```bash
+   kubectl get pods --namespace influxdb --selector=app=iox-shared-catalog
+   ```
+
    _Once removed_, proceed to the next step.
 
 7. **Restore the SQL snapshot to the Catalog**
 
-   Use `psql` to restore the recovery point snapshot to your InfluxDB Catalog. For example:
+   Use `psql` to restore the recovery point snapshot to your InfluxDB Catalog--for example:
 
   ```bash
   ```
 
   The exact `psql` command depends on your PostgreSQL-compatible database
   provider, their authentication requirements, and the database’s DSN.
@@ -360,22 +402,75 @@ using Catalog store snapshots:
 
 8. **Scale InfluxDB Clustered components back up**
 
+   > [!Important]
+   > **Critical startup sequence**
+   >
+   > When bringing services back online, start components in the correct order
+   > and wait for each critical component group to be fully ready before
+   > proceeding. This prevents temporary errors and ensures a clean startup.
+
    Use the `kubectl scale` command to scale your InfluxDB Clustered components
    back up to their original number of replicas. Perform the scaling operations
-   on components in reverse order--for example:
+   on components _in reverse order of shutdown_.
+
+   > [!Note]
+   > **Recommended startup sequence**
+   >
+   > For optimal cluster initialization and to prevent startup errors, wait for
+   > at least 2 catalog pods to be fully ready, then wait for at least 2 ingester
+   > pods to be fully ready before scaling up the remaining components.
+
+   1. **Scale catalog and wait for readiness**
+
+      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+
+      ```bash
+      kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-catalog
+      kubectl get pods --namespace influxdb --selector=app=iox-shared-catalog --watch
+      ```
+
+      Wait until at least 2 catalog pods show `Running` status with `2/2` in the READY column.
+
+   2. **Scale ingesters and wait for readiness**
+
+      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+
+      ```bash
+      kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-ingester
+      kubectl get pods --namespace influxdb --selector=app=iox-shared-ingester --watch
+      ```
+
+      Wait until at least 2 ingester pods show `Running` status and are ready.
+
+   3. **Scale remaining components**
+
+      After you have scaled the catalog and ingesters and verified they are stable, scale the remaining components.
+
+      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+
+      ```bash
+      kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-compactor
+      kubectl scale --namespace influxdb --replicas=2 deployment/iox-shared-querier
+      kubectl scale --namespace influxdb --replicas=1 deployment/global-router
+      kubectl scale --namespace influxdb --replicas=1 deployment/global-gc
+      ```
+
+9. **Verify the restore**
+
+   Verify that all InfluxDB Clustered pods are running:
 
-
+
 
    ```bash
-   kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-catalog
-   kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-ingester
-   kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-compactor
-   kubectl scale --namespace influxdb --replicas=2 deployment/iox-shared-querier
-   kubectl scale --namespace influxdb --replicas=1 deployment/global-router
-   kubectl scale --namespace influxdb --replicas=1 deployment/global-gc
+   kubectl get pods --namespace influxdb
    ```
 
-9. **Restart the kubit operator**
+   All pods should show `Running` status and be ready.
+
+10. **Restart the kubit operator**
 
    1. In your `AppInstance` resource, set `pause` to `false` or remove the
      `pause` field:
@@ -391,9 +486,7 @@ using Catalog store snapshots:
       # ...
       ```
 
-   2. Apply the change to resume the `kubit` operator and scale InfluxDB
-      Clustered components to the number of replicas defined for each in your
-      `AppInstance` resource:
+   2. Apply the change to resume the `kubit` operator:
 
@@ -401,12 +494,19 @@ using Catalog store snapshots:
 
+
+
      ```bash
      kubectl apply --filename myinfluxdb.yml --namespace influxdb
      ```
 
-   3. Verify that InfluxDB Clustered pods start running again.
-
 Your InfluxDB cluster is now restored to the recovery point. When the Garbage
 Collector runs, it identifies what Parquet files are not associated with the
 recovery point and [soft deletes](#soft-delete) them.
+> [!Note]
+> **Post-restore verification**
+>
+> After the restore completes, monitor your cluster logs and verify that:
+> - All pods are running and ready
+> - No error messages related to WAL inconsistencies appear
+> - Write and query operations function correctly
+> - The Garbage Collector operates normally
+
 ## Resources
 
 ### prep\_pg\_dump.awk

From 0df51e51d3177c8f9466b6ed273c15796e9272f6 Mon Sep 17 00:00:00 2001
From: Jason Stirnaman
Date: Thu, 10 Jul 2025 12:11:21 -0500
Subject: [PATCH 3/6] fix(clustered): note snapshot storage and utilization
 costs:
 - Note callout: Focuses on the storage cost implications of enabling the
 setting
 - Caution callout: Focuses on the risk of making recovery points unusable
 when disabled

---
 .../influxdb3/clustered/admin/backup-restore.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index e6eeca7c3..f85583c7e 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -161,11 +161,16 @@ snapshots. The default is `'false'`. Set to `'true'`:
 INFLUXDB_IOX_DELETE_USING_CATALOG_BACKUP_DATA_SNAPSHOT_FILES: 'true'
 ```
 
+> [!Note]
+> #### Storage utilization and costs
+>
+> Enabling this setting retains Parquet files referenced in snapshots, increasing
+> object storage utilization and costs. The longer you retain snapshots, the more
+> storage space and costs you incur.
+
 > [!Caution]
-> If set to `false` (the default) with snapshots enabled, the Garbage Collector does not check
-> to see if a Parquet file is associated with existing snapshots before removing
-> the Parquet file from the object store. This could result in deleting Parquet
-> files needed to restore the cluster to a recovery point.
+> If set to `false` (the default), the Garbage Collector may delete Parquet
+> files needed for snapshot restoration, making recovery points unusable.
 
 #### INFLUXDB_IOX_KEEP_HOURLY_CATALOG_BACKUP_FILE_LISTS
@@ -454,7 +459,7 @@ using Catalog store snapshots:
    ```bash
    kubectl scale --namespace influxdb --replicas=1 statefulset/iox-shared-compactor
    kubectl scale --namespace influxdb --replicas=2 deployment/iox-shared-querier
-   kubectl scale --namespace influxdb --replicas=1 deployment/global-router
+   kubectl scale --namespace influxdb --replicas=3 deployment/global-router
    kubectl scale --namespace influxdb --replicas=1 deployment/global-gc
    ```

From 7d75018a286d73f8e1f8a747e9d510d9154bc533 Mon Sep 17 00:00:00 2001
From: Jason Stirnaman
Date: Thu, 10 Jul 2025 12:18:02 -0500
Subject: [PATCH 4/6] Update content/influxdb3/clustered/admin/backup-restore.md

---
 content/influxdb3/clustered/admin/backup-restore.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index f85583c7e..229dc591c 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -326,7 +326,7 @@ using Catalog store snapshots:
    >
    > Before scaling down, record the current number of replicas for each component
    > to restore them to the correct scale later. Use the following commands:
-   > For example, to get the number of replicas for each component:
+   > For example, to get the number of active replicas for each component:
    >
    > ```bash
    > echo "GC: $(kubectl get deployment global-gc -n influxdb -o jsonpath='{.spec.replicas}')"

From 4aba6b74ea075c3038410e197fc15aaa37e0057d Mon Sep 17 00:00:00 2001
From: Jason Stirnaman
Date: Thu, 10 Jul 2025 12:20:48 -0500
Subject: [PATCH 5/6] Update content/influxdb3/clustered/admin/backup-restore.md

---
 content/influxdb3/clustered/admin/backup-restore.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index 229dc591c..cda07ee03 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -373,7 +373,7 @@ using Catalog store snapshots:
 
 6. **Scale down catalog last**
 
-   _After all other pods are removed_, Use the `kubectl scale` command to scale
+   _After all other pods are removed_, use the `kubectl scale` command to scale
    your InfluxDB Clustered Catalog down to zero replicas:

From 55be385c5e2c3f61017a24185b41ff8117cb0179 Mon Sep 17 00:00:00 2001
From: Jason Stirnaman
Date: Thu, 10 Jul 2025 12:46:40 -0500
Subject: [PATCH 6/6] fix(clustered): revise callouts and add expand for
 optional list command

---
 .../clustered/admin/backup-restore.md | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/content/influxdb3/clustered/admin/backup-restore.md b/content/influxdb3/clustered/admin/backup-restore.md
index cda07ee03..4333cda04 100644
--- a/content/influxdb3/clustered/admin/backup-restore.md
+++ b/content/influxdb3/clustered/admin/backup-restore.md
@@ -321,21 +321,20 @@ using Catalog store snapshots:
    > For example, Ingester pods must flush their Write-Ahead Logs (WAL) before
    > shutting down.
-   > [!Important]
-   > #### Take note of the number of replicas for each pod
-   >
-   > Before scaling down, record the current number of replicas for each component
-   > to restore them to the correct scale later. Use the following commands:
-   > For example, to get the number of active replicas for each component:
-   >
-   > ```bash
-   > echo "GC: $(kubectl get deployment global-gc -n influxdb -o jsonpath='{.spec.replicas}')"
-   > echo "Router: $(kubectl get deployment global-router -n influxdb -o jsonpath='{.spec.replicas}')"
-   > echo "Querier: $(kubectl get deployment iox-shared-querier -n influxdb -o jsonpath='{.spec.replicas}')"
-   > echo "Compactor: $(kubectl get statefulset iox-shared-compactor -n influxdb -o jsonpath='{.spec.replicas}')"
-   > echo "Ingester: $(kubectl get statefulset iox-shared-ingester -n influxdb -o jsonpath='{.spec.replicas}')"
-   > echo "Catalog: $(kubectl get statefulset iox-shared-catalog -n influxdb -o jsonpath='{.spec.replicas}')"
-   > ```
+   1. Before scaling down, record the current number of replicas for each component
+      to restore them to the correct scale later.
+      {{< expand-wrapper >}}
+      {{% expand "Get the number of replicas for each pod" %}}
+      ```bash
+      echo "GC: $(kubectl get deployment global-gc -n influxdb -o jsonpath='{.spec.replicas}')"
+      echo "Router: $(kubectl get deployment global-router -n influxdb -o jsonpath='{.spec.replicas}')"
+      echo "Querier: $(kubectl get deployment iox-shared-querier -n influxdb -o jsonpath='{.spec.replicas}')"
+      echo "Compactor: $(kubectl get statefulset iox-shared-compactor -n influxdb -o jsonpath='{.spec.replicas}')"
+      echo "Ingester: $(kubectl get statefulset iox-shared-ingester -n influxdb -o jsonpath='{.spec.replicas}')"
+      echo "Catalog: $(kubectl get statefulset iox-shared-catalog -n influxdb -o jsonpath='{.spec.replicas}')"
+      ```
+      {{% /expand %}}
+      {{< /expand-wrapper >}}
 
    1. **Scale down non-critical components first**
 
@@ -427,7 +426,7 @@ using Catalog store snapshots:
 
    1. **Scale catalog and wait for readiness**
 
-      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+      _Replace the number of replicas with the [original values](#get-the-number-of-replicas-for-each-pod) you noted when scaling down._
 
       ```bash
       kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-catalog
       kubectl get pods --namespace influxdb --selector=app=iox-shared-catalog --watch
       ```
 
       Wait until at least 2 catalog pods show `Running` status with `2/2` in the READY column.
 
    2. **Scale ingesters and wait for readiness**
 
-      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+      _Replace the number of replicas with the [original values](#get-the-number-of-replicas-for-each-pod) you noted when scaling down._
 
       ```bash
       kubectl scale --namespace influxdb --replicas=3 statefulset/iox-shared-ingester
       kubectl get pods --namespace influxdb --selector=app=iox-shared-ingester --watch
       ```
 
       Wait until at least 2 ingester pods show `Running` status and are ready.
 
    3. **Scale remaining components**
 
       After you have scaled the catalog and ingesters and verified they are stable, scale the remaining components.
 
-      _Replace the number of replicas with the original values you noted when [scaling down](#take-note-of-the-number-of-replicas-for-each-pod)._
+      _Replace the number of replicas with the [original values](#get-the-number-of-replicas-for-each-pod) you noted when scaling down._
 
       ```bash