Source code for airflow.providers.amazon.aws.hooks.glue

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import asyncio
import time
import warnings
from functools import cached_property
from typing import Any

from botocore.exceptions import ClientError

from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.providers.amazon.aws.hooks.logs import AwsLogsHook


[docs]
DEFAULT_LOG_SUFFIX = "output"


[docs]
ERROR_LOG_SUFFIX = "error"




[docs]
class GlueJobHook(AwsBaseHook):
    """
    Interact with AWS Glue.

    Provide thick wrapper around :external+boto3:py:class:`boto3.client("glue") <Glue.Client>`.

    :param s3_bucket: S3 bucket where logs and local etl script will be uploaded
    :param job_name: unique job name per AWS account
    :param desc: job description
    :param concurrent_run_limit: The maximum number of concurrent runs allowed for a job
    :param script_location: path to etl script on s3
    :param retry_limit: Maximum number of times to retry this job if it fails
    :param num_of_dpus: Number of AWS Glue DPUs to allocate to this Job
    :param region_name: aws region name (example: us-east-1)
    :param iam_role_name: AWS IAM Role for Glue Job Execution. If set `iam_role_arn` must equal None.
    :param iam_role_arn: AWS IAM Role ARN for Glue Job Execution, If set `iam_role_name` must equal None.
    :param create_job_kwargs: Extra arguments for Glue Job Creation
    :param update_config: Update job configuration on Glue (default: False)

    Additional arguments (such as ``aws_conn_id``) may be specified and
    are passed down to the underlying AwsBaseHook.

    .. seealso::
        - :class:`airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook`
    """


[docs]
    class LogContinuationTokens:
        """Used to hold the continuation tokens when reading logs from both streams Glue Jobs write to."""

        def __init__(self):

[docs]
            self.output_stream_continuation: str | None = None


[docs]
            self.error_stream_continuation: str | None = None



    def __init__(
        self,
        s3_bucket: str | None = None,
        job_name: str | None = None,
        desc: str | None = None,
        concurrent_run_limit: int = 1,
        script_location: str | None = None,
        retry_limit: int = 0,
        num_of_dpus: int | float | None = None,
        iam_role_name: str | None = None,
        iam_role_arn: str | None = None,
        create_job_kwargs: dict | None = None,
        update_config: bool = False,
        job_poll_interval: int | float = 6,
        *args,
        **kwargs,
    ):

[docs]
        self.job_name = job_name


[docs]
        self.desc = desc


[docs]
        self.concurrent_run_limit = concurrent_run_limit


[docs]
        self.script_location = script_location


[docs]
        self.retry_limit = retry_limit


[docs]
        self.s3_bucket = s3_bucket


[docs]
        self.role_name = iam_role_name


[docs]
        self.role_arn = iam_role_arn


[docs]
        self.s3_glue_logs = "logs/glue-logs/"


[docs]
        self.create_job_kwargs = create_job_kwargs or {}


[docs]
        self.update_config = update_config


[docs]
        self.job_poll_interval = job_poll_interval


        worker_type_exists = "WorkerType" in self.create_job_kwargs
        num_workers_exists = "NumberOfWorkers" in self.create_job_kwargs

        if self.role_arn and self.role_name:
            raise ValueError("Cannot set iam_role_arn and iam_role_name simultaneously")
        if worker_type_exists and num_workers_exists:
            if num_of_dpus is not None:
                raise ValueError("Cannot specify num_of_dpus with custom WorkerType")
        elif not worker_type_exists and num_workers_exists:
            raise ValueError("Need to specify custom WorkerType when specifying NumberOfWorkers")
        elif worker_type_exists and not num_workers_exists:
            raise ValueError("Need to specify NumberOfWorkers when specifying custom WorkerType")
        elif num_of_dpus is None:
            self.num_of_dpus: int | float = 10
        else:
            self.num_of_dpus = num_of_dpus

        kwargs["client_type"] = "glue"
        super().__init__(*args, **kwargs)


[docs]
    def create_glue_job_config(self) -> dict:
        default_command = {
            "Name": "glueetl",
            "ScriptLocation": self.script_location,
        }
        command = self.create_job_kwargs.pop("Command", default_command)
        if not self.role_arn:
            execution_role = self.get_iam_execution_role()
            role_arn = execution_role["Role"]["Arn"]
        else:
            role_arn = self.role_arn

        config = {
            "Name": self.job_name,
            "Description": self.desc,
            "Role": role_arn,
            "ExecutionProperty": {"MaxConcurrentRuns": self.concurrent_run_limit},
            "Command": command,
            "MaxRetries": self.retry_limit,
            **self.create_job_kwargs,
        }

        if hasattr(self, "num_of_dpus"):
            config["MaxCapacity"] = self.num_of_dpus

        if self.s3_bucket is not None:
            config["LogUri"] = f"s3://{self.s3_bucket}/{self.s3_glue_logs}{self.job_name}"

        return config



[docs]
    def describe_jobs(self) -> list:
        """
        Get list of Jobs.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.get_jobs`
        """
        return self.conn.get_jobs()



[docs]
    def list_jobs(self) -> list:
        """
        Get list of Jobs.

        .. deprecated::
            - Use :meth:`describe_jobs` instead.
        """
        warnings.warn(
            "The method `list_jobs` is deprecated. Use the method `describe_jobs` instead.",
            AirflowProviderDeprecationWarning,
            stacklevel=2,
        )
        return self.describe_jobs()



[docs]
    def get_iam_execution_role(self) -> dict:
        try:
            iam_client = self.get_session(region_name=self.region_name).client(
                "iam", endpoint_url=self.conn_config.endpoint_url, config=self.config, verify=self.verify
            )
            glue_execution_role = iam_client.get_role(RoleName=self.role_name)
            self.log.info("Iam Role Name: %s", self.role_name)
            return glue_execution_role
        except Exception as general_error:
            self.log.error("Failed to create aws glue job, error: %s", general_error)
            raise



[docs]
    def initialize_job(
        self,
        script_arguments: dict | None = None,
        run_kwargs: dict | None = None,
    ) -> dict[str, str]:
        """
        Initialize connection with AWS Glue to run job.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.start_job_run`
        """
        script_arguments = script_arguments or {}
        run_kwargs = run_kwargs or {}

        try:
            if self.update_config:
                job_name = self.create_or_update_glue_job()
            else:
                job_name = self.get_or_create_glue_job()

            return self.conn.start_job_run(JobName=job_name, Arguments=script_arguments, **run_kwargs)
        except Exception as general_error:
            self.log.error("Failed to run aws glue job, error: %s", general_error)
            raise



[docs]
    def get_job_state(self, job_name: str, run_id: str) -> str:
        """
        Get state of the Glue job; the job state can be running, finished, failed, stopped or timeout.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.get_job_run`

        :param job_name: unique job name per AWS account
        :param run_id: The job-run ID of the predecessor job run
        :return: State of the Glue job
        """
        job_run = self.conn.get_job_run(JobName=job_name, RunId=run_id, PredecessorsIncluded=True)
        return job_run["JobRun"]["JobRunState"]



[docs]
    async def async_get_job_state(self, job_name: str, run_id: str) -> str:
        """
        Get state of the Glue job; the job state can be running, finished, failed, stopped or timeout.

        The async version of get_job_state.
        """
        async with await self.get_async_conn() as client:
            job_run = await client.get_job_run(JobName=job_name, RunId=run_id)
        return job_run["JobRun"]["JobRunState"]


    @cached_property

[docs]
    def logs_hook(self):
        """Returns an AwsLogsHook instantiated with the parameters of the GlueJobHook."""
        return AwsLogsHook(
            aws_conn_id=self.aws_conn_id, region_name=self.region_name, verify=self.verify, config=self.config
        )



[docs]
    def print_job_logs(
        self,
        job_name: str,
        run_id: str,
        continuation_tokens: LogContinuationTokens,
    ):
        """
        Print the latest job logs to the Airflow task log and updates the continuation tokens.

        :param continuation_tokens: the tokens where to resume from when reading logs.
            The object gets updated with the new tokens by this method.
        """
        log_client = self.logs_hook.get_conn()
        paginator = log_client.get_paginator("filter_log_events")
        job_run = self.conn.get_job_run(JobName=job_name, RunId=run_id)["JobRun"]
        # StartTime needs to be an int and is Epoch time in milliseconds
        start_time = int(job_run["StartedOn"].timestamp() * 1000)

        def display_logs_from(log_group: str, continuation_token: str | None) -> str | None:
            """Mutualize iteration over the 2 different log streams glue jobs write to."""
            fetched_logs = []
            next_token = continuation_token
            try:
                for response in paginator.paginate(
                    logGroupName=log_group,
                    logStreamNames=[run_id],
                    startTime=start_time,
                    PaginationConfig={"StartingToken": continuation_token},
                ):
                    fetched_logs.extend([event["message"] for event in response["events"]])
                    # if the response is empty there is no nextToken in it
                    next_token = response.get("nextToken") or next_token
            except ClientError as e:
                if e.response["Error"]["Code"] == "ResourceNotFoundException":
                    # we land here when the log groups/streams don't exist yet
                    self.log.warning(
                        "No new Glue driver logs so far.\n"
                        "If this persists, check the CloudWatch dashboard at: %r.",
                        f"https://{self.conn_region_name}.console.aws.amazon.com/cloudwatch/home",
                    )
                else:
                    raise

            if len(fetched_logs):
                # Add a tab to indent those logs and distinguish them from airflow logs.
                # Log lines returned already contain a newline character at the end.
                messages = "\t".join(fetched_logs)
                self.log.info("Glue Job Run %s Logs:\n\t%s", log_group, messages)
            else:
                self.log.info("No new log from the Glue Job in %s", log_group)
            return next_token

        log_group_prefix = job_run["LogGroupName"]
        log_group_default = f"{log_group_prefix}/{DEFAULT_LOG_SUFFIX}"
        log_group_error = f"{log_group_prefix}/{ERROR_LOG_SUFFIX}"
        # one would think that the error log group would contain only errors, but it actually contains
        # a lot of interesting logs too, so it's valuable to have both
        continuation_tokens.output_stream_continuation = display_logs_from(
            log_group_default, continuation_tokens.output_stream_continuation
        )
        continuation_tokens.error_stream_continuation = display_logs_from(
            log_group_error, continuation_tokens.error_stream_continuation
        )



[docs]
    def job_completion(
        self, job_name: str, run_id: str, verbose: bool = False, sleep_before_return: int = 0
    ) -> dict[str, str]:
        """
        Wait until Glue job with job_name finishes; return final state if finished or raises AirflowException.

        :param job_name: unique job name per AWS account
        :param run_id: The job-run ID of the predecessor job run
        :param verbose: If True, more Glue Job Run logs show in the Airflow Task Logs.  (default: False)
        :param sleep_before_return: time in seconds to wait before returning final status.
        :return: Dict of JobRunState and JobRunId
        """
        next_log_tokens = self.LogContinuationTokens()
        while True:
            job_run_state = self.get_job_state(job_name, run_id)
            ret = self._handle_state(job_run_state, job_name, run_id, verbose, next_log_tokens)
            if ret:
                time.sleep(sleep_before_return)
                return ret
            time.sleep(self.job_poll_interval)



[docs]
    async def async_job_completion(self, job_name: str, run_id: str, verbose: bool = False) -> dict[str, str]:
        """
        Wait until Glue job with job_name finishes; return final state if finished or raises AirflowException.

        :param job_name: unique job name per AWS account
        :param run_id: The job-run ID of the predecessor job run
        :param verbose: If True, more Glue Job Run logs show in the Airflow Task Logs.  (default: False)
        :return: Dict of JobRunState and JobRunId
        """
        next_log_tokens = self.LogContinuationTokens()
        while True:
            job_run_state = await self.async_get_job_state(job_name, run_id)
            ret = self._handle_state(job_run_state, job_name, run_id, verbose, next_log_tokens)
            if ret:
                return ret
            await asyncio.sleep(self.job_poll_interval)


    def _handle_state(
        self,
        state: str,
        job_name: str,
        run_id: str,
        verbose: bool,
        next_log_tokens: GlueJobHook.LogContinuationTokens,
    ) -> dict | None:
        """Process Glue Job state while polling; used by both sync and async methods."""
        failed_states = ["FAILED", "TIMEOUT"]
        finished_states = ["SUCCEEDED", "STOPPED"]

        if verbose:
            self.print_job_logs(
                job_name=job_name,
                run_id=run_id,
                continuation_tokens=next_log_tokens,
            )

        if state in finished_states:
            self.log.info("Exiting Job %s Run State: %s", run_id, state)
            return {"JobRunState": state, "JobRunId": run_id}
        if state in failed_states:
            job_error_message = f"Exiting Job {run_id} Run State: {state}"
            self.log.info(job_error_message)
            raise AirflowException(job_error_message)
        self.log.info(
            "Polling for AWS Glue Job %s current run state with status %s",
            job_name,
            state,
        )
        return None


[docs]
    def has_job(self, job_name) -> bool:
        """
        Check if the job already exists.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.get_job`

        :param job_name: unique job name per AWS account
        :return: Returns True if the job already exists and False if not.
        """
        self.log.info("Checking if job already exists: %s", job_name)

        try:
            self.conn.get_job(JobName=job_name)
            return True
        except self.conn.exceptions.EntityNotFoundException:
            return False



[docs]
    def update_job(self, **job_kwargs) -> bool:
        """
        Update job configurations.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.update_job`

        :param job_kwargs: Keyword args that define the configurations used for the job
        :return: True if job was updated and false otherwise
        """
        job_name = job_kwargs.pop("Name")
        current_job = self.conn.get_job(JobName=job_name)["Job"]

        update_config = {
            key: value for key, value in job_kwargs.items() if current_job.get(key) != job_kwargs[key]
        }
        if update_config != {}:
            self.log.info("Updating job: %s", job_name)
            self.conn.update_job(JobName=job_name, JobUpdate=job_kwargs)
            self.log.info("Updated configurations: %s", update_config)
            return True
        return False



[docs]
    def get_or_create_glue_job(self) -> str | None:
        """
        Get (or creates) and returns the Job name.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.create_job`

        :return:Name of the Job
        """
        if self.has_job(self.job_name):
            return self.job_name

        config = self.create_glue_job_config()
        self.log.info("Creating job: %s", self.job_name)
        self.conn.create_job(**config)

        return self.job_name



[docs]
    def create_or_update_glue_job(self) -> str | None:
        """
        Create (or update) and return the Job name.

        .. seealso::
            - :external+boto3:py:meth:`Glue.Client.update_job`
            - :external+boto3:py:meth:`Glue.Client.create_job`

        :return:Name of the Job
        """
        config = self.create_glue_job_config()

        if self.has_job(self.job_name):
            self.update_job(**config)
        else:
            self.log.info("Creating job: %s", self.job_name)
            self.conn.create_job(**config)

        return self.job_name





[docs]
class GlueDataQualityHook(AwsBaseHook):
    """
    Interact with AWS Glue Data Quality.

    Provide thick wrapper around :external+boto3:py:class:`boto3.client("glue") <Glue.Client>`.

    Additional arguments (such as ``aws_conn_id``) may be specified and
    are passed down to the underlying AwsBaseHook.

    .. seealso::
        - :class:`airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook`
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        kwargs["client_type"] = "glue"
        super().__init__(*args, **kwargs)


[docs]
    def has_data_quality_ruleset(self, name: str) -> bool:
        try:
            self.conn.get_data_quality_ruleset(Name=name)
            return True
        except self.conn.exceptions.EntityNotFoundException:
            return False


    def _log_results(self, result: dict[str, Any]) -> None:
        """
        Print the outcome of evaluation run, An evaluation run can involve multiple rulesets evaluated against a data source (Glue table).

        Name    Description                                     Result        EvaluatedMetrics                                                                    EvaluationMessage
        Rule_1    RowCount between 150000 and 600000             PASS        {'Dataset.*.RowCount': 300000.0}                                                       NaN
        Rule_2    IsComplete "marketplace"                       PASS        {'Column.marketplace.Completeness': 1.0}                                               NaN
        Rule_3    ColumnLength "marketplace" between 1 and 2     FAIL        {'Column.marketplace.MaximumLength': 9.0, 'Column.marketplace.MinimumLength': 3.0}     Value: 9.0 does not meet the constraint requirement!

        """
        import pandas as pd

        pd.set_option("display.max_rows", None)
        pd.set_option("display.max_columns", None)
        pd.set_option("display.width", None)
        pd.set_option("display.max_colwidth", None)

        self.log.info(
            "AWS Glue data quality ruleset evaluation result for RulesetName: %s RulesetEvaluationRunId: %s Score: %s",
            result.get("RulesetName"),
            result.get("RulesetEvaluationRunId"),
            result.get("Score"),
        )

        rule_results = result["RuleResults"]
        rule_results_df = pd.DataFrame(rule_results)
        self.log.info(rule_results_df)


[docs]
    def get_evaluation_run_results(self, run_id: str) -> dict[str, Any]:
        response = self.conn.get_data_quality_ruleset_evaluation_run(RunId=run_id)

        return self.conn.batch_get_data_quality_result(ResultIds=response["ResultIds"])



[docs]
    def validate_evaluation_run_results(
        self, evaluation_run_id: str, show_results: bool = True, verify_result_status: bool = True
    ) -> None:
        results = self.get_evaluation_run_results(evaluation_run_id)
        total_failed_rules = 0

        if results.get("ResultsNotFound"):
            self.log.info(
                "AWS Glue data quality ruleset evaluation run, results not found for %s",
                results["ResultsNotFound"],
            )

        for result in results["Results"]:
            rule_results = result["RuleResults"]

            total_failed_rules += len(
                list(
                    filter(
                        lambda result: result.get("Result") == "FAIL" or result.get("Result") == "ERROR",
                        rule_results,
                    )
                )
            )

            if show_results:
                self._log_results(result)

        self.log.info(
            "AWS Glue data quality ruleset evaluation run, total number of rules failed: %s",
            total_failed_rules,
        )

        if verify_result_status and total_failed_rules > 0:
            raise AirflowException(
                "AWS Glue data quality ruleset evaluation run failed for one or more rules"
            )



[docs]
    def log_recommendation_results(self, run_id: str) -> None:
        """
        Print the outcome of recommendation run, recommendation run generates multiple rules against a data source (Glue table) in Data Quality Definition Language (DQDL) format.

        Rules = [
        IsComplete "NAME",
        ColumnLength "EMP_ID" between 1 and 12,
        IsUnique "EMP_ID",
        ColumnValues "INCOME" > 50000
        ]
        """
        result = self.conn.get_data_quality_rule_recommendation_run(RunId=run_id)

        if result.get("RecommendedRuleset"):
            self.log.info(
                "AWS Glue data quality recommended rules for DatabaseName: %s TableName: %s",
                result["DataSource"]["GlueTable"]["DatabaseName"],
                result["DataSource"]["GlueTable"]["TableName"],
            )
            self.log.info(result["RecommendedRuleset"])
        else:
            self.log.info("AWS Glue data quality, no recommended rules available for RunId: %s", run_id)