Source code for airflow.providers.amazon.aws.operators.redshift_cluster

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import time
from collections.abc import Sequence
from datetime import timedelta
from typing import TYPE_CHECKING, Any

from airflow.configuration import conf
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.redshift_cluster import RedshiftHook
from airflow.providers.amazon.aws.triggers.redshift_cluster import (
    RedshiftCreateClusterSnapshotTrigger,
    RedshiftCreateClusterTrigger,
    RedshiftDeleteClusterTrigger,
    RedshiftPauseClusterTrigger,
    RedshiftResumeClusterTrigger,
)
from airflow.providers.amazon.aws.utils import validate_execute_complete_event
from airflow.utils.helpers import prune_dict

if TYPE_CHECKING:
    from airflow.utils.context import Context



[docs]
class RedshiftCreateClusterOperator(BaseOperator):
    """
    Creates a new cluster with the specified parameters.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftCreateClusterOperator`

    :param cluster_identifier:  A unique identifier for the cluster.
    :param node_type: The node type to be provisioned for the cluster.
            Valid Values: ``ds2.xlarge``, ``ds2.8xlarge``, ``dc1.large``,
            ``dc1.8xlarge``, ``dc2.large``, ``dc2.8xlarge``, ``ra3.xlplus``,
            ``ra3.4xlarge``, and ``ra3.16xlarge``.
    :param master_username: The username associated with the admin user account for
        the cluster that is being created.
    :param master_user_password: The password associated with the admin user account for
        the cluster that is being created.
    :param cluster_type: The type of the cluster ``single-node`` or ``multi-node``.
        The default value is ``multi-node``.
    :param db_name: The name of the first database to be created when the cluster is created.
    :param number_of_nodes: The number of compute nodes in the cluster.
        This param require when ``cluster_type`` is ``multi-node``.
    :param cluster_security_groups: A list of security groups to be associated with this cluster.
    :param vpc_security_group_ids: A list of  VPC security groups to be associated with the cluster.
    :param cluster_subnet_group_name: The name of a cluster subnet group to be associated with this cluster.
    :param availability_zone: The EC2 Availability Zone (AZ).
    :param preferred_maintenance_window: The time range (in UTC) during which automated cluster
        maintenance can occur.
    :param cluster_parameter_group_name: The name of the parameter group to be associated with this cluster.
    :param automated_snapshot_retention_period: The number of days that automated snapshots are retained.
        The default value is ``1``.
    :param manual_snapshot_retention_period: The default number of days to retain a manual snapshot.
    :param port: The port number on which the cluster accepts incoming connections.
        The Default value is ``5439``.
    :param cluster_version: The version of a Redshift engine software that you want to deploy on the cluster.
    :param allow_version_upgrade: Whether major version upgrades can be applied during the maintenance window.
        The Default value is ``True``.
    :param publicly_accessible: Whether cluster can be accessed from a public network.
    :param encrypted: Whether data in the cluster is encrypted at rest.
        The default value is ``False``.
    :param hsm_client_certificate_identifier: Name of the HSM client certificate
        the Amazon Redshift cluster uses to retrieve the data.
    :param hsm_configuration_identifier: Name of the HSM configuration
    :param elastic_ip: The Elastic IP (EIP) address for the cluster.
    :param tags: A list of tag instances
    :param kms_key_id: KMS key id of encryption key.
    :param enhanced_vpc_routing: Whether to create the cluster with enhanced VPC routing enabled
        Default value is ``False``.
    :param additional_info: Reserved
    :param iam_roles: A list of IAM roles that can be used by the cluster to access other AWS services.
    :param maintenance_track_name: Name of the maintenance track for the cluster.
    :param snapshot_schedule_identifier: A  unique identifier for the snapshot schedule.
    :param availability_zone_relocation: Enable relocation for a Redshift cluster
        between Availability Zones after the cluster is created.
    :param aqua_configuration_status: The cluster is configured to use AQUA .
    :param default_iam_role_arn: ARN for the IAM role.
    :param aws_conn_id: str | None = The Airflow connection used for AWS credentials.
        The default connection id is ``aws_default``.
    :param wait_for_completion: Whether wait for the cluster to be in ``available`` state
    :param max_attempt: The maximum number of attempts to be made. Default: 5
    :param poll_interval: The amount of time in seconds to wait between attempts. Default: 60
    :param deferrable: If True, the operator will run in deferrable mode
    """


[docs]
    template_fields: Sequence[str] = (
        "cluster_identifier",
        "cluster_type",
        "node_type",
        "master_username",
        "master_user_password",
        "cluster_type",
        "db_name",
        "number_of_nodes",
        "cluster_security_groups",
        "vpc_security_group_ids",
        "cluster_subnet_group_name",
        "availability_zone",
        "preferred_maintenance_window",
        "cluster_parameter_group_name",
        "automated_snapshot_retention_period",
        "manual_snapshot_retention_period",
        "port",
        "cluster_version",
        "allow_version_upgrade",
        "publicly_accessible",
        "encrypted",
        "hsm_client_certificate_identifier",
        "hsm_configuration_identifier",
        "elastic_ip",
        "tags",
        "kms_key_id",
        "enhanced_vpc_routing",
        "additional_info",
        "iam_roles",
        "maintenance_track_name",
        "snapshot_schedule_identifier",
        "availability_zone_relocation",
        "aqua_configuration_status",
        "default_iam_role_arn",
    )


[docs]
    ui_color = "#eeaa11"


[docs]
    ui_fgcolor = "#ffffff"


    def __init__(
        self,
        *,
        cluster_identifier: str,
        node_type: str,
        master_username: str,
        master_user_password: str,
        cluster_type: str = "multi-node",
        db_name: str = "dev",
        number_of_nodes: int = 1,
        cluster_security_groups: list[str] | None = None,
        vpc_security_group_ids: list[str] | None = None,
        cluster_subnet_group_name: str | None = None,
        availability_zone: str | None = None,
        preferred_maintenance_window: str | None = None,
        cluster_parameter_group_name: str | None = None,
        automated_snapshot_retention_period: int = 1,
        manual_snapshot_retention_period: int | None = None,
        port: int = 5439,
        cluster_version: str = "1.0",
        allow_version_upgrade: bool = True,
        publicly_accessible: bool = True,
        encrypted: bool = False,
        hsm_client_certificate_identifier: str | None = None,
        hsm_configuration_identifier: str | None = None,
        elastic_ip: str | None = None,
        tags: list[Any] | None = None,
        kms_key_id: str | None = None,
        enhanced_vpc_routing: bool = False,
        additional_info: str | None = None,
        iam_roles: list[str] | None = None,
        maintenance_track_name: str | None = None,
        snapshot_schedule_identifier: str | None = None,
        availability_zone_relocation: bool | None = None,
        aqua_configuration_status: str | None = None,
        default_iam_role_arn: str | None = None,
        aws_conn_id: str | None = "aws_default",
        wait_for_completion: bool = False,
        max_attempt: int = 5,
        poll_interval: int = 60,
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.node_type = node_type


[docs]
        self.master_username = master_username


[docs]
        self.master_user_password = master_user_password


[docs]
        self.cluster_type = cluster_type


[docs]
        self.db_name = db_name


[docs]
        self.number_of_nodes = number_of_nodes


[docs]
        self.cluster_security_groups = cluster_security_groups


[docs]
        self.vpc_security_group_ids = vpc_security_group_ids


[docs]
        self.cluster_subnet_group_name = cluster_subnet_group_name


[docs]
        self.availability_zone = availability_zone


[docs]
        self.preferred_maintenance_window = preferred_maintenance_window


[docs]
        self.cluster_parameter_group_name = cluster_parameter_group_name


[docs]
        self.automated_snapshot_retention_period = automated_snapshot_retention_period


[docs]
        self.manual_snapshot_retention_period = manual_snapshot_retention_period


[docs]
        self.port = port


[docs]
        self.cluster_version = cluster_version


[docs]
        self.allow_version_upgrade = allow_version_upgrade


[docs]
        self.publicly_accessible = publicly_accessible


[docs]
        self.encrypted = encrypted


[docs]
        self.hsm_client_certificate_identifier = hsm_client_certificate_identifier


[docs]
        self.hsm_configuration_identifier = hsm_configuration_identifier


[docs]
        self.elastic_ip = elastic_ip


[docs]
        self.tags = tags


[docs]
        self.kms_key_id = kms_key_id


[docs]
        self.enhanced_vpc_routing = enhanced_vpc_routing


[docs]
        self.additional_info = additional_info


[docs]
        self.iam_roles = iam_roles


[docs]
        self.maintenance_track_name = maintenance_track_name


[docs]
        self.snapshot_schedule_identifier = snapshot_schedule_identifier


[docs]
        self.availability_zone_relocation = availability_zone_relocation


[docs]
        self.aqua_configuration_status = aqua_configuration_status


[docs]
        self.default_iam_role_arn = default_iam_role_arn


[docs]
        self.aws_conn_id = aws_conn_id


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.max_attempt = max_attempt


[docs]
        self.poll_interval = poll_interval


[docs]
        self.deferrable = deferrable


[docs]
        self.kwargs = kwargs



[docs]
    def execute(self, context: Context):
        redshift_hook = RedshiftHook(aws_conn_id=self.aws_conn_id)
        self.log.info("Creating Redshift cluster %s", self.cluster_identifier)
        params: dict[str, Any] = {}
        if self.db_name:
            params["DBName"] = self.db_name
        if self.cluster_type:
            params["ClusterType"] = self.cluster_type
            if self.cluster_type == "multi-node":
                params["NumberOfNodes"] = self.number_of_nodes
        if self.cluster_security_groups:
            params["ClusterSecurityGroups"] = self.cluster_security_groups
        if self.vpc_security_group_ids:
            params["VpcSecurityGroupIds"] = self.vpc_security_group_ids
        if self.cluster_subnet_group_name:
            params["ClusterSubnetGroupName"] = self.cluster_subnet_group_name
        if self.availability_zone:
            params["AvailabilityZone"] = self.availability_zone
        if self.preferred_maintenance_window:
            params["PreferredMaintenanceWindow"] = self.preferred_maintenance_window
        if self.cluster_parameter_group_name:
            params["ClusterParameterGroupName"] = self.cluster_parameter_group_name
        if self.automated_snapshot_retention_period:
            params["AutomatedSnapshotRetentionPeriod"] = self.automated_snapshot_retention_period
        if self.manual_snapshot_retention_period:
            params["ManualSnapshotRetentionPeriod"] = self.manual_snapshot_retention_period
        if self.port:
            params["Port"] = self.port
        if self.cluster_version:
            params["ClusterVersion"] = self.cluster_version
        if self.allow_version_upgrade:
            params["AllowVersionUpgrade"] = self.allow_version_upgrade
        if self.encrypted:
            params["Encrypted"] = self.encrypted
        if self.hsm_client_certificate_identifier:
            params["HsmClientCertificateIdentifier"] = self.hsm_client_certificate_identifier
        if self.hsm_configuration_identifier:
            params["HsmConfigurationIdentifier"] = self.hsm_configuration_identifier
        if self.elastic_ip:
            params["ElasticIp"] = self.elastic_ip
        if self.tags:
            params["Tags"] = self.tags
        if self.kms_key_id:
            params["KmsKeyId"] = self.kms_key_id
        if self.enhanced_vpc_routing:
            params["EnhancedVpcRouting"] = self.enhanced_vpc_routing
        if self.additional_info:
            params["AdditionalInfo"] = self.additional_info
        if self.iam_roles:
            params["IamRoles"] = self.iam_roles
        if self.maintenance_track_name:
            params["MaintenanceTrackName"] = self.maintenance_track_name
        if self.snapshot_schedule_identifier:
            params["SnapshotScheduleIdentifier"] = self.snapshot_schedule_identifier
        if self.availability_zone_relocation:
            params["AvailabilityZoneRelocation"] = self.availability_zone_relocation
        if self.aqua_configuration_status:
            params["AquaConfigurationStatus"] = self.aqua_configuration_status
        if self.default_iam_role_arn:
            params["DefaultIamRoleArn"] = self.default_iam_role_arn

        # PubliclyAccessible is True by default on Redshift side, hence, we should always set it regardless
        # of its value
        params["PubliclyAccessible"] = self.publicly_accessible

        cluster = redshift_hook.create_cluster(
            self.cluster_identifier,
            self.node_type,
            self.master_username,
            self.master_user_password,
            params,
        )
        if self.deferrable:
            self.defer(
                trigger=RedshiftCreateClusterTrigger(
                    cluster_identifier=self.cluster_identifier,
                    waiter_delay=self.poll_interval,
                    waiter_max_attempts=self.max_attempt,
                    aws_conn_id=self.aws_conn_id,
                ),
                method_name="execute_complete",
            )
        if self.wait_for_completion:
            redshift_hook.get_conn().get_waiter("cluster_available").wait(
                ClusterIdentifier=self.cluster_identifier,
                WaiterConfig={
                    "Delay": self.poll_interval,
                    "MaxAttempts": self.max_attempt,
                },
            )

        self.log.info("Created Redshift cluster %s", self.cluster_identifier)
        self.log.info(cluster)



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
        validated_event = validate_execute_complete_event(event)

        if validated_event["status"] != "success":
            raise AirflowException(f"Error creating cluster: {validated_event}")





[docs]
class RedshiftCreateClusterSnapshotOperator(BaseOperator):
    """
    Creates a manual snapshot of the specified cluster. The cluster must be in the available state.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftCreateClusterSnapshotOperator`

    :param snapshot_identifier: A unique identifier for the snapshot that you are requesting
    :param cluster_identifier: The cluster identifier for which you want a snapshot
    :param retention_period: The number of days that a manual snapshot is retained.
        If the value is -1, the manual snapshot is retained indefinitely.
    :param tags: A list of tag instances
    :param wait_for_completion: Whether wait for the cluster snapshot to be in ``available`` state
    :param poll_interval: Time (in seconds) to wait between two consecutive calls to check state
    :param max_attempt: The maximum number of attempts to be made to check the state
    :param aws_conn_id: The Airflow connection used for AWS credentials.
        The default connection id is ``aws_default``
    :param deferrable: If True, the operator will run as a deferrable operator.
    """


[docs]
    template_fields: Sequence[str] = (
        "cluster_identifier",
        "snapshot_identifier",
    )


    def __init__(
        self,
        *,
        snapshot_identifier: str,
        cluster_identifier: str,
        retention_period: int = -1,
        tags: list[Any] | None = None,
        wait_for_completion: bool = False,
        poll_interval: int = 15,
        max_attempt: int = 20,
        aws_conn_id: str | None = "aws_default",
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.snapshot_identifier = snapshot_identifier


[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.retention_period = retention_period


[docs]
        self.tags = tags


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.poll_interval = poll_interval


[docs]
        self.max_attempt = max_attempt


[docs]
        self.deferrable = deferrable


[docs]
        self.aws_conn_id = aws_conn_id


[docs]
        self.redshift_hook = RedshiftHook(aws_conn_id=aws_conn_id)



[docs]
    def execute(self, context: Context) -> Any:
        cluster_state = self.redshift_hook.cluster_status(cluster_identifier=self.cluster_identifier)
        if cluster_state != "available":
            raise AirflowException(
                "Redshift cluster must be in available state. "
                f"Redshift cluster current state is {cluster_state}"
            )

        self.redshift_hook.create_cluster_snapshot(
            cluster_identifier=self.cluster_identifier,
            snapshot_identifier=self.snapshot_identifier,
            retention_period=self.retention_period,
            tags=self.tags,
        )
        if self.deferrable:
            self.defer(
                trigger=RedshiftCreateClusterSnapshotTrigger(
                    cluster_identifier=self.cluster_identifier,
                    waiter_delay=self.poll_interval,
                    waiter_max_attempts=self.max_attempt,
                    aws_conn_id=self.aws_conn_id,
                ),
                method_name="execute_complete",
                # timeout is set to ensure that if a trigger dies, the timeout does not restart
                # 60 seconds is added to allow the trigger to exit gracefully (i.e. yield TriggerEvent)
                timeout=timedelta(seconds=self.max_attempt * self.poll_interval + 60),
            )

        if self.wait_for_completion:
            self.redshift_hook.get_conn().get_waiter("snapshot_available").wait(
                ClusterIdentifier=self.cluster_identifier,
                WaiterConfig={
                    "Delay": self.poll_interval,
                    "MaxAttempts": self.max_attempt,
                },
            )



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
        validated_event = validate_execute_complete_event(event)

        if validated_event["status"] != "success":
            raise AirflowException(f"Error creating snapshot: {validated_event}")

        self.log.info("Cluster snapshot created.")





[docs]
class RedshiftDeleteClusterSnapshotOperator(BaseOperator):
    """
    Deletes the specified manual snapshot.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftDeleteClusterSnapshotOperator`

    :param snapshot_identifier: A unique identifier for the snapshot that you are requesting
    :param cluster_identifier: The unique identifier of the cluster the snapshot was created from
    :param wait_for_completion: Whether wait for cluster deletion or not
        The default value is ``True``
    :param aws_conn_id: The Airflow connection used for AWS credentials.
        The default connection id is ``aws_default``
    :param poll_interval: Time (in seconds) to wait between two consecutive calls to check snapshot state
    """


[docs]
    template_fields: Sequence[str] = (
        "cluster_identifier",
        "snapshot_identifier",
    )


    def __init__(
        self,
        *,
        snapshot_identifier: str,
        cluster_identifier: str,
        wait_for_completion: bool = True,
        aws_conn_id: str | None = "aws_default",
        poll_interval: int = 10,
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.snapshot_identifier = snapshot_identifier


[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.poll_interval = poll_interval


[docs]
        self.redshift_hook = RedshiftHook(aws_conn_id=aws_conn_id)



[docs]
    def execute(self, context: Context) -> Any:
        self.redshift_hook.get_conn().delete_cluster_snapshot(
            SnapshotClusterIdentifier=self.cluster_identifier,
            SnapshotIdentifier=self.snapshot_identifier,
        )

        if self.wait_for_completion:
            while self.get_status() is not None:
                time.sleep(self.poll_interval)



[docs]
    def get_status(self) -> str:
        return self.redshift_hook.get_cluster_snapshot_status(
            snapshot_identifier=self.snapshot_identifier,
        )





[docs]
class RedshiftResumeClusterOperator(BaseOperator):
    """
    Resume a paused AWS Redshift Cluster.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftResumeClusterOperator`

    :param cluster_identifier:  Unique identifier of the AWS Redshift cluster
    :param aws_conn_id: The Airflow connection used for AWS credentials.
        The default connection id is ``aws_default``
    :param poll_interval: Time (in seconds) to wait between two consecutive calls to check cluster state
    :param max_attempts: The maximum number of attempts to check the state of the cluster.
    :param wait_for_completion: If True, the operator will wait for the cluster to be in the
        `resumed` state. Default is False.
    :param deferrable: If True, the operator will run as a deferrable operator.
    """


[docs]
    template_fields: Sequence[str] = ("cluster_identifier",)


[docs]
    ui_color = "#eeaa11"


[docs]
    ui_fgcolor = "#ffffff"


    def __init__(
        self,
        *,
        cluster_identifier: str,
        aws_conn_id: str | None = "aws_default",
        wait_for_completion: bool = False,
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        poll_interval: int = 30,
        max_attempts: int = 30,
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.aws_conn_id = aws_conn_id


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.deferrable = deferrable


[docs]
        self.max_attempts = max_attempts


[docs]
        self.poll_interval = poll_interval

        # These parameters are used to address an issue with the boto3 API where the API
        # prematurely reports the cluster as available to receive requests. This causes the cluster
        # to reject initial attempts to resume the cluster despite reporting the correct state.
        self._remaining_attempts = 10
        self._attempt_interval = 15


[docs]
    def execute(self, context: Context):
        redshift_hook = RedshiftHook(aws_conn_id=self.aws_conn_id)
        self.log.info("Starting resume cluster")
        while self._remaining_attempts:
            try:
                redshift_hook.get_conn().resume_cluster(ClusterIdentifier=self.cluster_identifier)
                break
            except redshift_hook.get_conn().exceptions.InvalidClusterStateFault as error:
                self._remaining_attempts -= 1

                if self._remaining_attempts:
                    self.log.error(
                        "Unable to resume cluster. %d attempts remaining.", self._remaining_attempts
                    )
                    time.sleep(self._attempt_interval)
                else:
                    raise error

        if self.wait_for_completion:
            if self.deferrable:
                cluster_state = redshift_hook.cluster_status(cluster_identifier=self.cluster_identifier)
                if cluster_state == "available":
                    self.log.info("Resumed cluster successfully")
                elif cluster_state == "deleting":
                    raise AirflowException(
                        "Unable to resume cluster since cluster is currently in status: %s", cluster_state
                    )
                else:
                    self.defer(
                        trigger=RedshiftResumeClusterTrigger(
                            cluster_identifier=self.cluster_identifier,
                            waiter_delay=self.poll_interval,
                            waiter_max_attempts=self.max_attempts,
                            aws_conn_id=self.aws_conn_id,
                        ),
                        method_name="execute_complete",
                        # timeout is set to ensure that if a trigger dies, the timeout does not restart
                        # 60 seconds is added to allow the trigger to exit gracefully (i.e. yield TriggerEvent)
                        timeout=timedelta(seconds=self.max_attempts * self.poll_interval + 60),
                    )
            else:
                waiter = redshift_hook.get_waiter("cluster_resumed")
                waiter.wait(
                    ClusterIdentifier=self.cluster_identifier,
                    WaiterConfig={
                        "Delay": self.poll_interval,
                        "MaxAttempts": self.max_attempts,
                    },
                )



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
        validated_event = validate_execute_complete_event(event)

        if validated_event["status"] != "success":
            raise AirflowException(f"Error resuming cluster: {validated_event}")
        self.log.info("Resumed cluster successfully")





[docs]
class RedshiftPauseClusterOperator(BaseOperator):
    """
    Pause an AWS Redshift Cluster if it has status `available`.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftPauseClusterOperator`

    :param cluster_identifier: id of the AWS Redshift Cluster
    :param aws_conn_id: The Airflow connection used for AWS credentials.
        If this is None or empty then the default boto3 behaviour is used. If
        running Airflow in a distributed manner and aws_conn_id is None or
        empty, then default boto3 configuration would be used (and must be
        maintained on each worker node).
    :param wait_for_completion: If True, waits for the cluster to be paused. (default: False)
    :param deferrable: Run operator in the deferrable mode
    :param poll_interval: Time (in seconds) to wait between two consecutive calls to check cluster state
    :param max_attempts: Maximum number of attempts to poll the cluster
    """


[docs]
    template_fields: Sequence[str] = ("cluster_identifier",)


[docs]
    ui_color = "#eeaa11"


[docs]
    ui_fgcolor = "#ffffff"


    def __init__(
        self,
        *,
        cluster_identifier: str,
        aws_conn_id: str | None = "aws_default",
        wait_for_completion: bool = False,
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        poll_interval: int = 30,
        max_attempts: int = 30,
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.aws_conn_id = aws_conn_id


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.deferrable = deferrable


[docs]
        self.max_attempts = max_attempts


[docs]
        self.poll_interval = poll_interval

        # These parameters are used to address an issue with the boto3 API where the API
        # prematurely reports the cluster as available to receive requests. This causes the cluster
        # to reject initial attempts to pause the cluster despite reporting the correct state.
        self._remaining_attempts = 10
        self._attempt_interval = 15


[docs]
    def execute(self, context: Context):
        redshift_hook = RedshiftHook(aws_conn_id=self.aws_conn_id)
        while self._remaining_attempts:
            try:
                redshift_hook.get_conn().pause_cluster(ClusterIdentifier=self.cluster_identifier)
                break
            except redshift_hook.get_conn().exceptions.InvalidClusterStateFault as error:
                self._remaining_attempts -= 1

                if self._remaining_attempts:
                    self.log.error(
                        "Unable to pause cluster. %d attempts remaining.", self._remaining_attempts
                    )
                    time.sleep(self._attempt_interval)
                else:
                    raise error
        if self.wait_for_completion:
            if self.deferrable:
                cluster_state = redshift_hook.cluster_status(cluster_identifier=self.cluster_identifier)
                if cluster_state == "paused":
                    self.log.info("Paused cluster successfully")
                elif cluster_state == "deleting":
                    raise AirflowException(
                        f"Unable to pause cluster since cluster is currently in status: {cluster_state}"
                    )
                else:
                    self.defer(
                        trigger=RedshiftPauseClusterTrigger(
                            cluster_identifier=self.cluster_identifier,
                            waiter_delay=self.poll_interval,
                            waiter_max_attempts=self.max_attempts,
                            aws_conn_id=self.aws_conn_id,
                        ),
                        method_name="execute_complete",
                        # timeout is set to ensure that if a trigger dies, the timeout does not restart
                        # 60 seconds is added to allow the trigger to exit gracefully (i.e. yield TriggerEvent)
                        timeout=timedelta(seconds=self.max_attempts * self.poll_interval + 60),
                    )
            else:
                waiter = redshift_hook.get_waiter("cluster_paused")
                waiter.wait(
                    ClusterIdentifier=self.cluster_identifier,
                    WaiterConfig=prune_dict(
                        {
                            "Delay": self.poll_interval,
                            "MaxAttempts": self.max_attempts,
                        }
                    ),
                )



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
        validated_event = validate_execute_complete_event(event)

        if validated_event["status"] != "success":
            raise AirflowException(f"Error pausing cluster: {validated_event}")
        self.log.info("Paused cluster successfully")





[docs]
class RedshiftDeleteClusterOperator(BaseOperator):
    """
    Delete an AWS Redshift cluster.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftDeleteClusterOperator`

    :param cluster_identifier: unique identifier of a cluster
    :param skip_final_cluster_snapshot: determines cluster snapshot creation
    :param final_cluster_snapshot_identifier: name of final cluster snapshot
    :param wait_for_completion: Whether wait for cluster deletion or not
        The default value is ``True``
    :param aws_conn_id: The Airflow connection used for AWS credentials.
        If this is None or empty then the default boto3 behaviour is used. If
        running Airflow in a distributed manner and aws_conn_id is None or
        empty, then default boto3 configuration would be used (and must be
        maintained on each worker node).
    :param poll_interval: Time (in seconds) to wait between two consecutive calls to check cluster state
    :param deferrable: Run operator in the deferrable mode.
    :param max_attempts: (Deferrable mode only) The maximum number of attempts to be made
    """


[docs]
    template_fields: Sequence[str] = ("cluster_identifier",)


[docs]
    ui_color = "#eeaa11"


[docs]
    ui_fgcolor = "#ffffff"


    def __init__(
        self,
        *,
        cluster_identifier: str,
        skip_final_cluster_snapshot: bool = True,
        final_cluster_snapshot_identifier: str | None = None,
        wait_for_completion: bool = True,
        aws_conn_id: str | None = "aws_default",
        poll_interval: int = 30,
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        max_attempts: int = 30,
        **kwargs,
    ):
        super().__init__(**kwargs)

[docs]
        self.cluster_identifier = cluster_identifier


[docs]
        self.skip_final_cluster_snapshot = skip_final_cluster_snapshot


[docs]
        self.final_cluster_snapshot_identifier = final_cluster_snapshot_identifier


[docs]
        self.wait_for_completion = wait_for_completion


[docs]
        self.poll_interval = poll_interval

        # These parameters are added to keep trying if there is a running operation in the cluster
        # If there is a running operation in the cluster while trying to delete it, a InvalidClusterStateFault
        # is thrown. In such case, retrying
        self._attempts = 10
        self._attempt_interval = 15

[docs]
        self.redshift_hook = RedshiftHook(aws_conn_id=aws_conn_id)


[docs]
        self.aws_conn_id = aws_conn_id


[docs]
        self.deferrable = deferrable


[docs]
        self.max_attempts = max_attempts



[docs]
    def execute(self, context: Context):
        while self._attempts:
            try:
                self.redshift_hook.delete_cluster(
                    cluster_identifier=self.cluster_identifier,
                    skip_final_cluster_snapshot=self.skip_final_cluster_snapshot,
                    final_cluster_snapshot_identifier=self.final_cluster_snapshot_identifier,
                )
                break
            except self.redshift_hook.conn.exceptions.InvalidClusterStateFault:
                self._attempts -= 1

                if self._attempts:
                    current_state = self.redshift_hook.conn.describe_clusters(
                        ClusterIdentifier=self.cluster_identifier
                    )["Clusters"][0]["ClusterStatus"]
                    self.log.error(
                        "Cluster in %s state, unable to delete. %d attempts remaining.",
                        current_state,
                        self._attempts,
                    )
                    time.sleep(self._attempt_interval)
                else:
                    raise

        if self.deferrable:
            cluster_state = self.redshift_hook.cluster_status(cluster_identifier=self.cluster_identifier)
            if cluster_state == "cluster_not_found":
                self.log.info("Cluster deleted successfully")
            elif cluster_state in ("creating", "modifying"):
                raise AirflowException(
                    f"Unable to delete cluster since cluster is currently in status: {cluster_state}"
                )
            else:
                self.defer(
                    timeout=timedelta(seconds=self.max_attempts * self.poll_interval + 60),
                    trigger=RedshiftDeleteClusterTrigger(
                        cluster_identifier=self.cluster_identifier,
                        waiter_delay=self.poll_interval,
                        waiter_max_attempts=self.max_attempts,
                        aws_conn_id=self.aws_conn_id,
                    ),
                    method_name="execute_complete",
                )

        elif self.wait_for_completion:
            waiter = self.redshift_hook.conn.get_waiter("cluster_deleted")
            waiter.wait(
                ClusterIdentifier=self.cluster_identifier,
                WaiterConfig={"Delay": self.poll_interval, "MaxAttempts": self.max_attempts},
            )



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
        validated_event = validate_execute_complete_event(event)

        if validated_event["status"] != "success":
            raise AirflowException(f"Error deleting cluster: {validated_event}")

        self.log.info("Cluster deleted successfully")