Source code for airflow.utils.sqlalchemy

# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import datetime
import os
import json
import pendulum
import time
import random

from dateutil import relativedelta
from sqlalchemy import event, exc, select
from sqlalchemy.types import Text, DateTime, TypeDecorator

from airflow.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log
utc = pendulum.timezone('UTC')


def setup_event_handlers(engine,
                         reconnect_timeout_seconds,
                         initial_backoff_seconds=0.2,
                         max_backoff_seconds=120):
    @event.listens_for(engine, "engine_connect")
    def ping_connection(connection, branch):
        """
        Pessimistic SQLAlchemy disconnect handling. Ensures that each
        connection returned from the pool is properly connected to the database.

        http://docs.sqlalchemy.org/en/rel_1_1/core/pooling.html#disconnect-handling-pessimistic
        """
        if branch:
            # "branch" refers to a sub-connection of a connection,
            # we don't want to bother pinging on these.
            return

        start = time.time()
        backoff = initial_backoff_seconds

        # turn off "close with result".  This flag is only used with
        # "connectionless" execution, otherwise will be False in any case
        save_should_close_with_result = connection.should_close_with_result

        while True:
            connection.should_close_with_result = False

            try:
                connection.scalar(select([1]))
                # If we made it here then the connection appears to be healthy
                break
            except exc.DBAPIError as err:
                if time.time() - start >= reconnect_timeout_seconds:
                    log.error(
                        "Failed to re-establish DB connection within %s secs: %s",
                        reconnect_timeout_seconds,
                        err)
                    raise
                if err.connection_invalidated:
                    log.warning("DB connection invalidated. Reconnecting...")

                    # Use a truncated binary exponential backoff. Also includes
                    # a jitter to prevent the thundering herd problem of
                    # simultaneous client reconnects
                    backoff += backoff * random.random()
                    time.sleep(min(backoff, max_backoff_seconds))

                    # run the same SELECT again - the connection will re-validate
                    # itself and establish a new connection.  The disconnect detection
                    # here also causes the whole connection pool to be invalidated
                    # so that all stale connections are discarded.
                    continue
                else:
                    log.error(
                        "Unknown database connection error. Not retrying: %s",
                        err)
                    raise
            finally:
                # restore "close with result"
                connection.should_close_with_result = save_should_close_with_result

    @event.listens_for(engine, "connect")
    def connect(dbapi_connection, connection_record):
        connection_record.info['pid'] = os.getpid()

    if engine.dialect.name == "sqlite":
        @event.listens_for(engine, "connect")
        def set_sqlite_pragma(dbapi_connection, connection_record):
            cursor = dbapi_connection.cursor()
            cursor.execute("PRAGMA foreign_keys=ON")
            cursor.close()

    # this ensures sanity in mysql when storing datetimes (not required for postgres)
    if engine.dialect.name == "mysql":
        @event.listens_for(engine, "connect")
        def set_mysql_timezone(dbapi_connection, connection_record):
            cursor = dbapi_connection.cursor()
            cursor.execute("SET time_zone = '+00:00'")
            cursor.close()

    @event.listens_for(engine, "checkout")
    def checkout(dbapi_connection, connection_record, connection_proxy):
        pid = os.getpid()
        if connection_record.info['pid'] != pid:
            connection_record.connection = connection_proxy.connection = None
            raise exc.DisconnectionError(
                "Connection record belongs to pid {}, "
                "attempting to check out in pid {}".format(connection_record.info['pid'], pid)
            )


[docs]class UtcDateTime(TypeDecorator): """ Almost equivalent to :class:`~sqlalchemy.types.DateTime` with ``timezone=True`` option, but it differs from that by: - Never silently take naive :class:`~datetime.datetime`, instead it always raise :exc:`ValueError` unless time zone aware value. - :class:`~datetime.datetime` value's :attr:`~datetime.datetime.tzinfo` is always converted to UTC. - Unlike SQLAlchemy's built-in :class:`~sqlalchemy.types.DateTime`, it never return naive :class:`~datetime.datetime`, but time zone aware value, even with SQLite or MySQL. - Always returns DateTime in UTC """ impl = DateTime(timezone=True) def process_bind_param(self, value, dialect): if value is not None: if not isinstance(value, datetime.datetime): raise TypeError('expected datetime.datetime, not ' + repr(value)) elif value.tzinfo is None: raise ValueError('naive datetime is disallowed') return value.astimezone(utc) def process_result_value(self, value, dialect): """ Processes DateTimes from the DB making sure it is always returning UTC. Not using timezone.convert_to_utc as that converts to configured TIMEZONE while the DB might be running with some other setting. We assume UTC datetimes in the database. """ if value is not None: if value.tzinfo is None: value = value.replace(tzinfo=utc) else: value = value.astimezone(utc) return value
[docs]class Interval(TypeDecorator): impl = Text attr_keys = { datetime.timedelta: ('days', 'seconds', 'microseconds'), relativedelta.relativedelta: ( 'years', 'months', 'days', 'leapdays', 'hours', 'minutes', 'seconds', 'microseconds', 'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', ), } def process_bind_param(self, value, dialect): if type(value) in self.attr_keys: attrs = { key: getattr(value, key) for key in self.attr_keys[type(value)] } return json.dumps({'type': type(value).__name__, 'attrs': attrs}) return json.dumps(value) def process_result_value(self, value, dialect): if not value: return value data = json.loads(value) if isinstance(data, dict): type_map = {key.__name__: key for key in self.attr_keys} return type_map[data['type']](**data['attrs']) return data