Source code for airflow.providers.google.cloud.openlineage.utils
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import copy
import logging
import os
import pathlib
import re
from collections import defaultdict
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

from airflow.providers.common.compat.openlineage.facet import (
    ColumnLineageDatasetFacet,
    DatasetFacet,
    DocumentationDatasetFacet,
    Fields,
    Identifier,
    InputField,
    SchemaDatasetFacet,
    SchemaDatasetFacetFields,
    SymlinksDatasetFacet,
)
from airflow.providers.common.compat.openlineage.utils.spark import (
    inject_parent_job_information_into_spark_properties,
    inject_transport_information_into_spark_properties,
)
from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
from google.cloud.dataproc_v1 import Batch, RuntimeConfig

if TYPE_CHECKING:
    from airflow.providers.common.compat.openlineage.facet import Dataset
    from airflow.utils.context import Context
    from google.cloud.bigquery.table import Table
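
# Module-level names referenced by the functions below but defined outside this excerpt; they are
# reproduced here (assumed values) so the snippet is self-contained: `log` is the module logger and
# `WILDCARD` is assumed to be the GCS glob character.
log = logging.getLogger(__name__)

WILDCARD = "*"
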
def merge_column_lineage_facets(facets: list[ColumnLineageDatasetFacet]) -> ColumnLineageDatasetFacet:
    """
    Merge multiple column lineage facets into a single consolidated facet.

    Specifically, it aggregates input fields and transformations for each field across all provided facets.

    Args:
        facets: Column Lineage Facets to be merged.

    Returns:
        A new Column Lineage Facet containing all fields, their respective input fields and transformations.

    Notes:
        - Input fields are uniquely identified by their `(namespace, name, field)` tuple.
        - If multiple facets contain the same field with the same input field, those input fields are
          merged without duplication.
        - Transformations associated with input fields are also merged. If transformations are not
          supported by the version of the `InputField` class, they will be omitted.
        - Transformation merging relies on a composite key of the field name and input field tuple
          to track and consolidate transformations.

    Examples:
        Case 1: Two facets with the same input field

        ```
        >>> facet1 = ColumnLineageDatasetFacet(
        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
        ... )
        >>> facet2 = ColumnLineageDatasetFacet(
        ...     fields={"columnA": Fields(inputFields=[InputField("namespace1", "dataset1", "field1")])}
        ... )
        >>> merged = merge_column_lineage_facets([facet1, facet2])
        >>> merged.fields["columnA"].inputFields
        [InputField("namespace1", "dataset1", "field1")]
        ```

        Case 2: Two facets with different transformations for the same input field

        ```
        >>> facet1 = ColumnLineageDatasetFacet(
        ...     fields={
        ...         "columnA": Fields(
        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t1"])]
        ...         )
        ...     }
        ... )
        >>> facet2 = ColumnLineageDatasetFacet(
        ...     fields={
        ...         "columnA": Fields(
        ...             inputFields=[InputField("namespace1", "dataset1", "field1", transformations=["t2"])]
        ...         )
        ...     }
        ... )
        >>> merged = merge_column_lineage_facets([facet1, facet2])
        >>> merged.fields["columnA"].inputFields[0].transformations
        ["t1", "t2"]
        ```
    """
    # Dictionary to collect all unique input fields for each field name
    fields_sources: dict[str, set[tuple[str, str, str]]] = defaultdict(set)
    # Dictionary to aggregate transformations for each input field
    transformations: dict[str, list] = defaultdict(list)

    for facet in facets:
        for field_name, single_field in facet.fields.items():
            for input_field in single_field.inputFields:
                input_key_fields = (input_field.namespace, input_field.name, input_field.field)
                fields_sources[field_name].add(input_key_fields)

                if single_transformations := getattr(input_field, "transformations", []):
                    transformation_key = "".join((field_name, *input_key_fields))
                    transformations[transformation_key].extend(single_transformations)

    # Check if the `InputField` class supports the `transformations` attribute (since OL client 1.17.1)
    input_field_allows_transformation_info = True
    try:
        InputField(namespace="a", name="b", field="c", transformations=[])
    except TypeError:
        input_field_allows_transformation_info = False

    return ColumnLineageDatasetFacet(
        fields={
            field_name: Fields(
                inputFields=[
                    InputField(
                        namespace,
                        name,
                        column,
                        transformations.get("".join((field_name, namespace, name, column)), []),
                    )
                    if input_field_allows_transformation_info
                    else InputField(namespace, name, column)
                    for namespace, name, column in sorted(input_fields)
                ],
                transformationType="",  # Legacy transformation information
                transformationDescription="",  # Legacy transformation information
            )
            for field_name, input_fields in fields_sources.items()
        }
    )
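
# Illustrative sketch (not part of the provider source): merging facets that describe *different*
# output columns yields a single facet covering both. Dataset and column names are made up.
def _example_merge_disjoint_columns() -> ColumnLineageDatasetFacet:  # hypothetical helper
    facet_a = ColumnLineageDatasetFacet(
        fields={"total": Fields(inputFields=[InputField("bigquery", "project.dataset.orders", "amount")])}
    )
    facet_b = ColumnLineageDatasetFacet(
        fields={"customer": Fields(inputFields=[InputField("bigquery", "project.dataset.customers", "id")])}
    )
    merged = merge_column_lineage_facets([facet_a, facet_b])
    # sorted(merged.fields) == ["customer", "total"]
    return merged
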
def extract_ds_name_from_gcs_path(path: str) -> str:
    """
    Extract and process the dataset name from a given path.

    Args:
        path: The path to process, e.g. of a GCS file.

    Returns:
        The processed dataset name.

    Examples:
        >>> extract_ds_name_from_gcs_path("/dir/file.*")
        'dir'
        >>> extract_ds_name_from_gcs_path("/dir/pre_")
        'dir'
        >>> extract_ds_name_from_gcs_path("/dir/file.txt")
        'dir/file.txt'
        >>> extract_ds_name_from_gcs_path("/dir/file.")
        'dir'
        >>> extract_ds_name_from_gcs_path("/dir/")
        'dir'
        >>> extract_ds_name_from_gcs_path("")
        '/'
        >>> extract_ds_name_from_gcs_path("/")
        '/'
        >>> extract_ds_name_from_gcs_path(".")
        '/'
    """
    if WILDCARD in path:
        path = path.split(WILDCARD, maxsplit=1)[0]

    # We want to end up with the parent directory if the path:
    # - does not refer to a file (no dot in the last segment)
    #   and does not explicitly end with a slash; it is then treated as a prefix and removed.
    #   Example: "/dir/pre_" -> "/dir/"
    # - contains a dot at the end; it is then treated as a prefix (created after removing the wildcard).
    #   Example: "/dir/file." (was "/dir/file.*" with wildcard) -> "/dir/"
    last_path_segment = os.path.basename(path).rstrip(".")
    if "." not in last_path_segment and not path.endswith("/"):
        path = pathlib.Path(path).parent.as_posix()

    # Normalize the path:
    # - Remove trailing slashes.
    # - Remove leading slashes.
    # - Handle edge cases for empty paths or single-dot paths.
    path = path.rstrip("/")
    path = path.lstrip("/")
    if path in ("", "."):
        path = "/"

    return path
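
# Illustrative sketch (not part of the provider source): combining the imported _parse_gcs_url helper
# with extract_ds_name_from_gcs_path to build an OpenLineage-style (namespace, name) pair. The URI is
# made up, and the "gs://<bucket>" namespace convention is an assumption of this example.
def _example_gcs_dataset_identity() -> tuple[str, str]:  # hypothetical helper
    bucket, blob = _parse_gcs_url("gs://my-bucket/dir/part-*.parquet")  # -> ("my-bucket", "dir/part-*.parquet")
    namespace = f"gs://{bucket}"
    name = extract_ds_name_from_gcs_path(f"/{blob}")  # wildcard stripped -> "dir"
    return namespace, name
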
def get_facets_from_bq_table(table: Table) -> dict[str, DatasetFacet]:
    """Get facets from BigQuery table object."""
    facets: dict[str, DatasetFacet] = {}
    if table.schema:
        facets["schema"] = SchemaDatasetFacet(
            fields=[
                SchemaDatasetFacetFields(
                    name=schema_field.name,
                    type=schema_field.field_type,
                    description=schema_field.description,
                )
                for schema_field in table.schema
            ]
        )
    if table.description:
        facets["documentation"] = DocumentationDatasetFacet(description=table.description)

    if table.external_data_configuration:
        symlinks = get_namespace_name_from_source_uris(table.external_data_configuration.source_uris)
        facets["symlink"] = SymlinksDatasetFacet(
            identifiers=[
                Identifier(
                    namespace=namespace,
                    name=name,
                    type="file" if namespace.startswith("gs://") else "table",
                )
                for namespace, name in sorted(symlinks)
            ]
        )

    return facets
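
# Illustrative sketch (not part of the provider source): fetching a BigQuery table via the provider's
# BigQueryHook and turning it into OpenLineage dataset facets. The table id is made up and valid GCP
# credentials are assumed.
def _example_bq_table_facets() -> dict[str, DatasetFacet]:  # hypothetical helper
    from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

    client = BigQueryHook().get_client()
    table = client.get_table("my-project.my_dataset.my_table")
    return get_facets_from_bq_table(table)  # may contain "schema", "documentation" and "symlink" keys
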
def get_identity_column_lineage_facet(
    dest_field_names: Iterable[str],
    input_datasets: Iterable[Dataset],
) -> dict[str, DatasetFacet]:
    """
    Get column lineage facet for identity transformations.

    This function generates a simple column lineage facet, where each destination column consists of
    source columns of the same name from all input datasets that have that column.

    The lineage assumes there are no transformations applied, meaning the columns retain their identity
    between the source and destination datasets.

    Args:
        dest_field_names: A list of destination column names for which lineage should be determined.
        input_datasets: A list of input datasets with schema facets.

    Returns:
        A dictionary containing a single key, `columnLineage`, mapped to a `ColumnLineageDatasetFacet`.
        If no column lineage can be determined, an empty dictionary is returned - see Notes below.

    Notes:
        - If any input dataset lacks a schema facet, the function immediately returns an empty dictionary.
        - If any field in the source dataset's schema is not present in the destination table, the
          function returns an empty dictionary. The destination table can contain extra fields, but all
          source columns should be present in the destination table.
        - If none of the destination columns can be matched to input dataset columns, an empty
          dictionary is returned.
        - Extra columns in the destination table that do not exist in the input datasets are ignored and
          skipped in the lineage facet, as they cannot be traced back to a source column.
        - The function assumes there are no transformations applied, meaning the columns retain their
          identity between the source and destination datasets.
    """
    fields_sources: dict[str, list[Dataset]] = {}
    for ds in input_datasets:
        if not ds.facets or "schema" not in ds.facets:
            return {}
        for schema_field in ds.facets["schema"].fields:  # type: ignore[attr-defined]
            if schema_field.name not in dest_field_names:
                return {}
            fields_sources[schema_field.name] = fields_sources.get(schema_field.name, []) + [ds]

    if not fields_sources:
        return {}

    column_lineage_facet = ColumnLineageDatasetFacet(
        fields={
            field_name: Fields(
                inputFields=[
                    InputField(namespace=dataset.namespace, name=dataset.name, field=field_name)
                    for dataset in source_datasets
                ],
                transformationType="IDENTITY",
                transformationDescription="identical",
            )
            for field_name, source_datasets in fields_sources.items()
        }
    )
    return {"columnLineage": column_lineage_facet}
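
# Illustrative sketch (not part of the provider source): identity column lineage from a single GCS
# input dataset into a destination table that has one extra column. All names are made up.
def _example_identity_lineage() -> dict[str, DatasetFacet]:  # hypothetical helper
    from airflow.providers.common.compat.openlineage.facet import Dataset

    source = Dataset(
        namespace="gs://source-bucket",
        name="dir",
        facets={
            "schema": SchemaDatasetFacet(
                fields=[
                    SchemaDatasetFacetFields(name="id", type="INTEGER"),
                    SchemaDatasetFacetFields(name="name", type="STRING"),
                ]
            )
        },
    )
    # "created_at" exists only in the destination, so it is simply skipped in the lineage facet.
    return get_identity_column_lineage_facet(
        dest_field_names=["id", "name", "created_at"], input_datasets=[source]
    )
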
def get_from_nullable_chain(source: Any, chain: list[str]) -> Any | None:
    """
    Get object from nested structure of objects, where it's not guaranteed that all keys in the nested
    structure exist.

    Intended to replace a chain of `dict.get()` statements.

    Example usage:

    .. code-block:: python

        if (
            not job._properties.get("statistics")
            or not job._properties.get("statistics").get("query")
            or not job._properties.get("statistics").get("query").get("referencedTables")
        ):
            return None
        result = job._properties.get("statistics").get("query").get("referencedTables")

    becomes:

    .. code-block:: python

        result = get_from_nullable_chain(job._properties, ["statistics", "query", "referencedTables"])
        if not result:
            return None
    """
    # chain.pop modifies the passed list, which can be unexpected
    chain = chain.copy()
    chain.reverse()
    try:
        while chain:
            while isinstance(source, list) and len(source) == 1:
                source = source[0]
            next_key = chain.pop()
            if isinstance(source, dict):
                source = source.get(next_key)
            else:
                source = getattr(source, next_key)
        return source
    except AttributeError:
        return None
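
# Illustrative sketch (not part of the provider source): the data below mimics a fragment of a
# BigQuery job resource; the helper returns None instead of raising when a key is missing.
def _example_nullable_chain() -> None:  # hypothetical helper
    job_properties = {"statistics": {"query": {"referencedTables": [{"tableId": "users"}]}}}
    assert get_from_nullable_chain(job_properties, ["statistics", "query", "referencedTables"]) == [
        {"tableId": "users"}
    ]
    assert get_from_nullable_chain(job_properties, ["statistics", "query", "queryPlan"]) is None
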
def _is_openlineage_provider_accessible() -> bool:
    """
    Check if the OpenLineage provider is accessible.

    This function attempts to import the necessary OpenLineage modules and checks if the provider
    is enabled and the listener is available.

    Returns:
        bool: True if the OpenLineage provider is accessible, False otherwise.
    """
    try:
        from airflow.providers.openlineage.conf import is_disabled
        from airflow.providers.openlineage.plugins.listener import get_openlineage_listener
    except ImportError:
        log.debug("OpenLineage provider could not be imported.")
        return False

    if is_disabled():
        log.debug("OpenLineage provider is disabled.")
        return False

    if not get_openlineage_listener():
        log.debug("OpenLineage listener could not be found.")
        return False

    return True


def _extract_supported_job_type_from_dataproc_job(job: dict) -> str | None:
    """
    Extract job type from a Dataproc job definition.

    Args:
        job: The Dataproc job definition.

    Returns:
        The job type for which the automatic OL injection is supported, if found, otherwise None.
    """
    supported_job_types = ("sparkJob", "pysparkJob", "spark_job", "pyspark_job")
    return next((job_type for job_type in supported_job_types if job_type in job), None)


def _replace_dataproc_job_properties(job: dict, job_type: str, new_properties: dict) -> dict:
    """
    Replace the properties of a specific job type in a Dataproc job definition.

    Args:
        job: The original Dataproc job definition.
        job_type: The key representing the job type (e.g., "sparkJob").
        new_properties: The new properties to replace the existing ones.

    Returns:
        A modified copy of the job with updated properties.

    Raises:
        KeyError: If the job_type does not exist in the job definition.
    """
    if job_type not in job:
        raise KeyError(f"Job type '{job_type}' is missing in the job definition.")

    updated_job = job.copy()
    updated_job[job_type] = job[job_type].copy()
    updated_job[job_type]["properties"] = new_properties
    return updated_job
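
# Illustrative sketch (not part of the provider source): how the two job helpers above cooperate on a
# dict-based Dataproc job. The job content and the injected property are made up.
def _example_replace_job_properties() -> dict:  # hypothetical helper
    job = {"pyspark_job": {"main_python_file_uri": "gs://bucket/app.py", "properties": {"k": "v"}}}
    job_type = _extract_supported_job_type_from_dataproc_job(job)  # -> "pyspark_job"
    assert job_type is not None
    # Only the returned copy carries the new properties; the original `job` dict is left untouched.
    return _replace_dataproc_job_properties(
        job=job, job_type=job_type, new_properties={"k": "v", "spark.openlineage.namespace": "default"}
    )
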
def inject_openlineage_properties_into_dataproc_job(
    job: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
) -> dict:
    """
    Inject OpenLineage properties into Spark job definition.

    This function does not remove existing configurations or modify the job definition in any way,
    except to add the required OpenLineage properties if they are not already present.

    The entire properties injection process will be skipped if any condition is met:
        - The OpenLineage provider is not accessible.
        - The job type is unsupported.
        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.

    Additionally, specific information will not be injected if relevant OpenLineage properties
    already exist.

    Parent job information will not be injected if:
        - Any property prefixed with `spark.openlineage.parent` exists.
        - `inject_parent_job_info` is False.
    Transport information will not be injected if:
        - Any property prefixed with `spark.openlineage.transport` exists.
        - `inject_transport_info` is False.

    Args:
        job: The original Dataproc job definition.
        context: The Airflow context in which the job is running.
        inject_parent_job_info: Flag indicating whether to inject parent job information.
        inject_transport_info: Flag indicating whether to inject transport information.

    Returns:
        The modified job definition with OpenLineage properties injected, if applicable.
    """
    if not inject_parent_job_info and not inject_transport_info:
        log.debug("Automatic injection of OpenLineage information is disabled.")
        return job

    if not _is_openlineage_provider_accessible():
        log.warning(
            "Could not access OpenLineage provider for automatic OpenLineage "
            "properties injection. No action will be performed."
        )
        return job

    if (job_type := _extract_supported_job_type_from_dataproc_job(job)) is None:
        log.warning(
            "Could not find a supported Dataproc job type for automatic OpenLineage "
            "properties injection. No action will be performed.",
        )
        return job

    properties = job[job_type].get("properties", {})

    if inject_parent_job_info:
        log.debug("Injecting OpenLineage parent job information into Spark properties.")
        properties = inject_parent_job_information_into_spark_properties(
            properties=properties, context=context
        )

    if inject_transport_info:
        log.debug("Injecting OpenLineage transport information into Spark properties.")
        properties = inject_transport_information_into_spark_properties(
            properties=properties, context=context
        )

    job_with_ol_config = _replace_dataproc_job_properties(
        job=job, job_type=job_type, new_properties=properties
    )
    return job_with_ol_config
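
# Illustrative sketch (not part of the provider source): a hypothetical custom operator enriching a
# job definition right before submission; the job content is made up. In the provider, this helper is
# invoked by the Dataproc operators when their OpenLineage injection flags are enabled.
from airflow.models import BaseOperator


class _ExampleSubmitJobWithLineage(BaseOperator):  # hypothetical operator
    def execute(self, context):
        job = {
            "placement": {"cluster_name": "my-cluster"},
            "pyspark_job": {"main_python_file_uri": "gs://bucket/app.py"},
        }
        job = inject_openlineage_properties_into_dataproc_job(
            job=job, context=context, inject_parent_job_info=True, inject_transport_info=True
        )
        # With the OpenLineage provider installed, enabled and its listener active, the
        # "pyspark_job" properties now contain spark.openlineage.* entries.
        return job
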
def _is_dataproc_batch_of_supported_type(batch: dict | Batch) -> bool:
    """
    Check if a Dataproc batch is of a supported type for OpenLineage automatic injection.

    This function determines if the given batch is of a supported type
    by checking for specific job type attributes or keys in the batch.

    Args:
        batch: The Dataproc batch to check.

    Returns:
        True if the batch is of a supported type (`spark_batch` or `pyspark_batch`), otherwise False.
    """
    supported_job_types = ("spark_batch", "pyspark_batch")
    if isinstance(batch, Batch):
        if any(getattr(batch, job_type) for job_type in supported_job_types):
            return True
        return False

    # For dictionary-based batch
    if any(job_type in batch for job_type in supported_job_types):
        return True
    return False


def _extract_dataproc_batch_properties(batch: dict | Batch) -> dict:
    """
    Extract Dataproc batch properties from a Batch object or dictionary.

    This function retrieves the `properties` from the `runtime_config` of a Dataproc `Batch` object
    or a dictionary representation of a batch.

    Args:
        batch: The Dataproc batch to extract properties from.

    Returns:
        Extracted `properties` if found, otherwise an empty dictionary.
    """
    if isinstance(batch, Batch):
        return dict(batch.runtime_config.properties)

    # For dictionary-based batch
    run_time_config = batch.get("runtime_config", {})
    if isinstance(run_time_config, RuntimeConfig):
        return dict(run_time_config.properties)
    return run_time_config.get("properties", {})


def _replace_dataproc_batch_properties(batch: dict | Batch, new_properties: dict) -> dict | Batch:
    """
    Replace the properties of a Dataproc batch.

    Args:
        batch: The original Dataproc batch definition.
        new_properties: The new properties to replace the existing ones.

    Returns:
        A modified copy of the Dataproc batch definition with updated properties.
    """
    batch = copy.deepcopy(batch)
    if isinstance(batch, Batch):
        if not batch.runtime_config:
            batch.runtime_config = RuntimeConfig(properties=new_properties)
        elif isinstance(batch.runtime_config, dict):
            batch.runtime_config["properties"] = new_properties
        else:
            batch.runtime_config.properties = new_properties
        return batch

    # For dictionary-based batch
    run_time_config = batch.get("runtime_config")
    if not run_time_config:
        batch["runtime_config"] = {"properties": new_properties}
    elif isinstance(run_time_config, dict):
        run_time_config["properties"] = new_properties
    else:
        run_time_config.properties = new_properties
    return batch
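
# Illustrative sketch (not part of the provider source): the helpers above accept both a plain dict
# and a google.cloud.dataproc_v1.Batch message. Batch content and property values are made up.
def _example_batch_properties_roundtrip() -> None:  # hypothetical helper
    dict_batch = {
        "pyspark_batch": {"main_python_file_uri": "gs://bucket/app.py"},
        "runtime_config": {"properties": {"spark.executor.instances": "2"}},
    }
    proto_batch = Batch(
        pyspark_batch={"main_python_file_uri": "gs://bucket/app.py"},
        runtime_config={"properties": {"spark.executor.instances": "2"}},
    )
    for batch in (dict_batch, proto_batch):
        assert _is_dataproc_batch_of_supported_type(batch)
        properties = _extract_dataproc_batch_properties(batch)
        updated = _replace_dataproc_batch_properties(
            batch, {**properties, "spark.openlineage.namespace": "default"}
        )
        assert _extract_dataproc_batch_properties(updated)["spark.openlineage.namespace"] == "default"
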
def inject_openlineage_properties_into_dataproc_batch(
    batch: dict | Batch, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
) -> dict | Batch:
    """
    Inject OpenLineage properties into Dataproc batch definition.

    This function does not remove existing configurations or modify the batch definition in any way,
    except to add the required OpenLineage properties if they are not already present.

    The entire properties injection process will be skipped if any condition is met:
        - The OpenLineage provider is not accessible.
        - The batch type is unsupported.
        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.

    Additionally, specific information will not be injected if relevant OpenLineage properties
    already exist.

    Parent job information will not be injected if:
        - Any property prefixed with `spark.openlineage.parent` exists.
        - `inject_parent_job_info` is False.
    Transport information will not be injected if:
        - Any property prefixed with `spark.openlineage.transport` exists.
        - `inject_transport_info` is False.

    Args:
        batch: The original Dataproc batch definition.
        context: The Airflow context in which the job is running.
        inject_parent_job_info: Flag indicating whether to inject parent job information.
        inject_transport_info: Flag indicating whether to inject transport information.

    Returns:
        The modified batch definition with OpenLineage properties injected, if applicable.
    """
    if not inject_parent_job_info and not inject_transport_info:
        log.debug("Automatic injection of OpenLineage information is disabled.")
        return batch

    if not _is_openlineage_provider_accessible():
        log.warning(
            "Could not access OpenLineage provider for automatic OpenLineage "
            "properties injection. No action will be performed."
        )
        return batch

    if not _is_dataproc_batch_of_supported_type(batch):
        log.warning(
            "Could not find a supported Dataproc batch type for automatic OpenLineage "
            "properties injection. No action will be performed.",
        )
        return batch

    properties = _extract_dataproc_batch_properties(batch)

    if inject_parent_job_info:
        log.debug("Injecting OpenLineage parent job information into Spark properties.")
        properties = inject_parent_job_information_into_spark_properties(
            properties=properties, context=context
        )

    if inject_transport_info:
        log.debug("Injecting OpenLineage transport information into Spark properties.")
        properties = inject_transport_information_into_spark_properties(
            properties=properties, context=context
        )

    batch_with_ol_config = _replace_dataproc_batch_properties(batch=batch, new_properties=properties)
    return batch_with_ol_config
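
# Illustrative sketch (not part of the provider source): when both injection flags are disabled the
# batch is returned unchanged before the Airflow context is ever inspected, so a placeholder value is
# used for `context` here. The batch content is made up.
def _example_injection_disabled() -> None:  # hypothetical helper
    batch = {"pyspark_batch": {"main_python_file_uri": "gs://bucket/app.py"}}
    result = inject_openlineage_properties_into_dataproc_batch(
        batch=batch,
        context={},  # placeholder; unused on this code path
        inject_parent_job_info=False,
        inject_transport_info=False,
    )
    assert result is batch
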
def inject_openlineage_properties_into_dataproc_workflow_template(
    template: dict, context: Context, inject_parent_job_info: bool, inject_transport_info: bool
) -> dict:
    """
    Inject OpenLineage properties into all Spark jobs within Workflow Template.

    This function does not remove existing configurations or modify the jobs definition in any way,
    except to add the required OpenLineage properties if they are not already present.

    The entire properties injection process for each job will be skipped if any condition is met:
        - The OpenLineage provider is not accessible.
        - The job type is unsupported.
        - Both `inject_parent_job_info` and `inject_transport_info` are set to False.

    Additionally, specific information will not be injected if relevant OpenLineage properties
    already exist.

    Parent job information will not be injected if:
        - Any property prefixed with `spark.openlineage.parent` exists.
        - `inject_parent_job_info` is False.
    Transport information will not be injected if:
        - Any property prefixed with `spark.openlineage.transport` exists.
        - `inject_transport_info` is False.

    Args:
        template: The original Dataproc Workflow Template definition.
        context: The Airflow context in which the job is running.
        inject_parent_job_info: Flag indicating whether to inject parent job information.
        inject_transport_info: Flag indicating whether to inject transport information.

    Returns:
        The modified Workflow Template definition with OpenLineage properties injected, if applicable.
    """
    if not inject_parent_job_info and not inject_transport_info:
        log.debug("Automatic injection of OpenLineage information is disabled.")
        return template

    if not _is_openlineage_provider_accessible():
        log.warning(
            "Could not access OpenLineage provider for automatic OpenLineage "
            "properties injection. No action will be performed."
        )
        return template

    final_jobs = []
    for single_job_definition in template["jobs"]:
        step_id = single_job_definition["step_id"]
        log.debug("Injecting OpenLineage properties into Workflow step: `%s`", step_id)

        if (job_type := _extract_supported_job_type_from_dataproc_job(single_job_definition)) is None:
            log.debug(
                "Could not find a supported Dataproc job type for automatic OpenLineage "
                "properties injection. No action will be performed.",
            )
            final_jobs.append(single_job_definition)
            continue

        properties = single_job_definition[job_type].get("properties", {})

        if inject_parent_job_info:
            log.debug("Injecting OpenLineage parent job information into Spark properties.")
            properties = inject_parent_job_information_into_spark_properties(
                properties=properties, context=context
            )

        if inject_transport_info:
            log.debug("Injecting OpenLineage transport information into Spark properties.")
            properties = inject_transport_information_into_spark_properties(
                properties=properties, context=context
            )

        job_with_ol_config = _replace_dataproc_job_properties(
            job=single_job_definition, job_type=job_type, new_properties=properties
        )
        final_jobs.append(job_with_ol_config)

    template["jobs"] = final_jobs
    return template
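
# Illustrative sketch (not part of the provider source): a template mixing a supported and an
# unsupported step; only the Spark-based step can receive OpenLineage properties. `context` is the
# Airflow task context (e.g. the argument of an operator's execute()); the template content is made up.
def _example_enrich_workflow_template(context: Context) -> dict:  # hypothetical helper
    template = {
        "placement": {"managed_cluster": {"cluster_name": "ol-cluster"}},
        "jobs": [
            {"step_id": "etl", "pyspark_job": {"main_python_file_uri": "gs://bucket/etl.py"}},
            {"step_id": "cleanup", "hive_job": {"query_file_uri": "gs://bucket/cleanup.sql"}},
        ],
    }
    # The "cleanup" step is passed through untouched; "etl" may gain spark.openlineage.* properties.
    return inject_openlineage_properties_into_dataproc_workflow_template(
        template=template, context=context, inject_parent_job_info=True, inject_transport_info=True
    )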