# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import asyncio
import datetime
import traceback
import warnings
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, Any, AsyncIterator
from airflow.exceptions import AirflowProviderDeprecationWarning
from airflow.providers.cncf.kubernetes.hooks.kubernetes import AsyncKubernetesHook
from airflow.providers.cncf.kubernetes.utils.pod_manager import (
OnFinishAction,
PodLaunchTimeoutException,
PodPhase,
)
from airflow.triggers.base import BaseTrigger, TriggerEvent
if TYPE_CHECKING:
from kubernetes_asyncio.client.models import V1Pod
from pendulum import DateTime
[docs]class ContainerState(str, Enum):
"""
Possible container states.
See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase.
"""
[docs] TERMINATED = "terminated"
[docs] UNDEFINED = "undefined"
[docs]class KubernetesPodTrigger(BaseTrigger):
"""
KubernetesPodTrigger run on the trigger worker to check the state of Pod.
:param pod_name: The name of the pod.
:param pod_namespace: The namespace of the pod.
:param kubernetes_conn_id: The :ref:`kubernetes connection id <howto/connection:kubernetes>`
for the Kubernetes cluster.
:param cluster_context: Context that points to kubernetes cluster.
:param config_file: Path to kubeconfig file.
:param poll_interval: Polling period in seconds to check for the status.
:param trigger_start_time: time in Datetime format when the trigger was started
:param in_cluster: run kubernetes client with in_cluster configuration.
:param get_logs: get the stdout of the container as logs of the tasks.
:param startup_timeout: timeout in seconds to start up the pod.
:param startup_check_interval: interval in seconds to check if the pod has already started.
:param on_finish_action: What to do when the pod reaches its final state, or the execution is interrupted.
If "delete_pod", the pod will be deleted regardless its state; if "delete_succeeded_pod",
only succeeded pod will be deleted. You can set to "keep_pod" to keep the pod.
:param should_delete_pod: What to do when the pod reaches its final
state, or the execution is interrupted. If True (default), delete the
pod; if False, leave the pod.
Deprecated - use `on_finish_action` instead.
:param logging_interval: number of seconds to wait before kicking it back to
the operator to print latest logs. If ``None`` will wait until container done.
:param last_log_time: where to resume logs from
"""
def __init__(
self,
pod_name: str,
pod_namespace: str,
trigger_start_time: datetime.datetime,
base_container_name: str,
kubernetes_conn_id: str | None = None,
poll_interval: float = 2,
cluster_context: str | None = None,
config_file: str | None = None,
in_cluster: bool | None = None,
get_logs: bool = True,
startup_timeout: int = 120,
startup_check_interval: int = 5,
on_finish_action: str = "delete_pod",
should_delete_pod: bool | None = None,
last_log_time: DateTime | None = None,
logging_interval: int | None = None,
):
super().__init__()
self.pod_name = pod_name
self.pod_namespace = pod_namespace
self.trigger_start_time = trigger_start_time
self.base_container_name = base_container_name
self.kubernetes_conn_id = kubernetes_conn_id
self.poll_interval = poll_interval
self.cluster_context = cluster_context
self.config_file = config_file
self.in_cluster = in_cluster
self.get_logs = get_logs
self.startup_timeout = startup_timeout
self.startup_check_interval = startup_check_interval
self.last_log_time = last_log_time
self.logging_interval = logging_interval
if should_delete_pod is not None:
warnings.warn(
"`should_delete_pod` parameter is deprecated, please use `on_finish_action`",
category=AirflowProviderDeprecationWarning,
stacklevel=2,
)
self.on_finish_action = (
OnFinishAction.DELETE_POD if should_delete_pod else OnFinishAction.KEEP_POD
)
self.should_delete_pod = should_delete_pod
else:
self.on_finish_action = OnFinishAction(on_finish_action)
self.should_delete_pod = self.on_finish_action == OnFinishAction.DELETE_POD
self._since_time = None
[docs] def serialize(self) -> tuple[str, dict[str, Any]]:
"""Serialize KubernetesCreatePodTrigger arguments and classpath."""
return (
"airflow.providers.cncf.kubernetes.triggers.pod.KubernetesPodTrigger",
{
"pod_name": self.pod_name,
"pod_namespace": self.pod_namespace,
"base_container_name": self.base_container_name,
"kubernetes_conn_id": self.kubernetes_conn_id,
"poll_interval": self.poll_interval,
"cluster_context": self.cluster_context,
"config_file": self.config_file,
"in_cluster": self.in_cluster,
"get_logs": self.get_logs,
"startup_timeout": self.startup_timeout,
"startup_check_interval": self.startup_check_interval,
"trigger_start_time": self.trigger_start_time,
"should_delete_pod": self.should_delete_pod,
"on_finish_action": self.on_finish_action.value,
"last_log_time": self.last_log_time,
"logging_interval": self.logging_interval,
},
)
[docs] async def run(self) -> AsyncIterator[TriggerEvent]: # type: ignore[override]
"""Get current pod status and yield a TriggerEvent."""
self.log.info("Checking pod %r in namespace %r.", self.pod_name, self.pod_namespace)
try:
state = await self._wait_for_pod_start()
if state == ContainerState.TERMINATED:
event = TriggerEvent(
{
"status": "success",
"namespace": self.pod_namespace,
"name": self.pod_name,
"message": "All containers inside pod have started successfully.",
}
)
elif state == ContainerState.FAILED:
event = TriggerEvent(
{
"status": "failed",
"namespace": self.pod_namespace,
"name": self.pod_name,
"message": "pod failed",
}
)
else:
event = await self._wait_for_container_completion()
yield event
return
except PodLaunchTimeoutException as e:
message = self._format_exception_description(e)
yield TriggerEvent(
{
"name": self.pod_name,
"namespace": self.pod_namespace,
"status": "timeout",
"message": message,
}
)
return
except Exception as e:
yield TriggerEvent(
{
"name": self.pod_name,
"namespace": self.pod_namespace,
"status": "error",
"message": str(e),
"stack_trace": traceback.format_exc(),
}
)
return
def _format_exception_description(self, exc: Exception) -> Any:
if isinstance(exc, PodLaunchTimeoutException):
return exc.args[0]
description = f"Trigger {self.__class__.__name__} failed with exception {exc.__class__.__name__}."
message = exc.args and exc.args[0] or ""
if message:
description += f"\ntrigger exception message: {message}"
curr_traceback = traceback.format_exc()
description += f"\ntrigger traceback:\n{curr_traceback}"
return description
async def _wait_for_pod_start(self) -> ContainerState:
"""Loops until pod phase leaves ``PENDING`` If timeout is reached, throws error."""
delta = datetime.datetime.now(tz=datetime.timezone.utc) - self.trigger_start_time
while self.startup_timeout >= delta.total_seconds():
pod = await self.hook.get_pod(self.pod_name, self.pod_namespace)
if not pod.status.phase == "Pending":
return self.define_container_state(pod)
self.log.info("Still waiting for pod to start. The pod state is %s", pod.status.phase)
await asyncio.sleep(self.startup_check_interval)
delta = datetime.datetime.now(tz=datetime.timezone.utc) - self.trigger_start_time
raise PodLaunchTimeoutException("Pod did not leave 'Pending' phase within specified timeout")
async def _wait_for_container_completion(self) -> TriggerEvent:
"""
Wait for container completion.
Waits until container is no longer in running state. If trigger is configured with a logging period,
then will emit an event to resume the task for the purpose of fetching more logs.
"""
time_begin = datetime.datetime.now(tz=datetime.timezone.utc)
time_get_more_logs = None
if self.logging_interval is not None:
time_get_more_logs = time_begin + datetime.timedelta(seconds=self.logging_interval)
while True:
pod = await self.hook.get_pod(self.pod_name, self.pod_namespace)
container_state = self.define_container_state(pod)
if container_state == ContainerState.TERMINATED:
return TriggerEvent(
{
"status": "success",
"namespace": self.pod_namespace,
"name": self.pod_name,
"last_log_time": self.last_log_time,
}
)
elif container_state == ContainerState.FAILED:
return TriggerEvent(
{
"status": "failed",
"namespace": self.pod_namespace,
"name": self.pod_name,
"message": "Container state failed",
"last_log_time": self.last_log_time,
}
)
if time_get_more_logs and datetime.datetime.now(tz=datetime.timezone.utc) > time_get_more_logs:
return TriggerEvent(
{
"status": "running",
"last_log_time": self.last_log_time,
"namespace": self.pod_namespace,
"name": self.pod_name,
}
)
await asyncio.sleep(self.poll_interval)
def _get_async_hook(self) -> AsyncKubernetesHook:
# TODO: Remove this method when the min version of kubernetes provider is 7.12.0 in Google provider.
return AsyncKubernetesHook(
conn_id=self.kubernetes_conn_id,
in_cluster=self.in_cluster,
config_file=self.config_file,
cluster_context=self.cluster_context,
)
@cached_property
[docs] def hook(self) -> AsyncKubernetesHook:
return self._get_async_hook()
[docs] def define_container_state(self, pod: V1Pod) -> ContainerState:
pod_containers = pod.status.container_statuses
if pod_containers is None:
return ContainerState.UNDEFINED
container = next(c for c in pod_containers if c.name == self.base_container_name)
for state in (ContainerState.RUNNING, ContainerState.WAITING, ContainerState.TERMINATED):
state_obj = getattr(container.state, state)
if state_obj is not None:
if state != ContainerState.TERMINATED:
return state
else:
return ContainerState.TERMINATED if state_obj.exit_code == 0 else ContainerState.FAILED
return ContainerState.UNDEFINED
@staticmethod
[docs] def should_wait(pod_phase: PodPhase, container_state: ContainerState) -> bool:
return (
container_state == ContainerState.WAITING
or container_state == ContainerState.RUNNING
or (container_state == ContainerState.UNDEFINED and pod_phase == PodPhase.PENDING)
)