How to parse user_defined_macro in regular function or PythonOperator in Airflow
Question:
We use managed Airflow inside a GCP project.
When I used BigQueryInsertJobOperator to execute queries in a query file, it used to automatically replace user_defined_macros in those files with the set value.
from airflow import DAG
from datetime import datetime
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
BigQueryInsertJobOperator(
task_id = "my_task",
configuration = {
"query": {
"query": "{% include '/queries/my_query.sql' %}",
"useLegacySql": False,
},
},
dag = dag,
)
For various reasons, I am switching to using a regular function or PythonOperator to execute those queries using a BigQuery client. However, I am unable to figure out how to get the user_defined_macros to be parsed. The below is what I have so far, but it doesn’t work, of course. Please help.
from airflow import DAG
from datetime import datetime
from google.cloud import bigquery
from airflow.decorators import task
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
@task
def test():
query = open('/home/airflow/gcs/dags/queries/my_query.sql').read()
bq_client = bigquery.Client()
bq_client.query(query).result()
test()
Answers:
In Airflow operators, only the arguments listed in the operator's template_fields
attribute are rendered by Jinja. In the PythonOperator
(the operator used in your case), Jinja renders the op_args
and op_kwargs
arguments, and if your Airflow version is 2.4.1+, the templates_dict argument
is rendered too (see the Airflow PR that added templates_dict rendering).
For Airflow 2.4.1+:
@task(templates_dict={"query": "/home/airflow/gcs/dags/queries/my_query.sql"}, templates_exts=[".sql"])
def test(**context):
query = context["templates_dict"]["query"]
bq_client = bigquery.Client()
bq_client.query(query).result()
test()
For older versions:
class MyCustomOperator(BaseOperator):
template_fields = ("query",)
template_ext = (".sql",)
def __init__(self, query, **kwargs):
super(MyCustomOperator, self).__init__(**kwargs)
self.query = query
def execute(self, context):
bq_client = bigquery.Client()
bq_client.query(self.query).result()
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
test = MyCustomOperator(task_id="test", query="test.sql")
You can use jinja2 templating to render your query.
from jinja2 import Template
...
@task
def test():
with open(<your_templated_file_path>) as f:
template = Template(f.read())
query = template.render(
**<dict_from_macros_or_other>,
)
bq_client = bigquery.Client()
bq_client.query(query).result()
We use managed Airflow inside a GCP project.
When I used BigQueryInsertJobOperator to execute queries in a query file, it used to automatically replace user_defined_macros in those files with the set value.
from airflow import DAG
from datetime import datetime
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
BigQueryInsertJobOperator(
task_id = "my_task",
configuration = {
"query": {
"query": "{% include '/queries/my_query.sql' %}",
"useLegacySql": False,
},
},
dag = dag,
)
For various reasons, I am switching to using a regular function or PythonOperator to execute those queries using a BigQuery client. However, I am unable to figure out how to get the user_defined_macros to be parsed. The below is what I have so far, but it doesn’t work, of course. Please help.
from airflow import DAG
from datetime import datetime
from google.cloud import bigquery
from airflow.decorators import task
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
@task
def test():
query = open('/home/airflow/gcs/dags/queries/my_query.sql').read()
bq_client = bigquery.Client()
bq_client.query(query).result()
test()
In Airflow operators, only the arguments listed in the operator's template_fields
attribute are rendered by Jinja. In the PythonOperator
(the operator used in your case), Jinja renders the op_args
and op_kwargs
arguments, and if your Airflow version is 2.4.1+, the templates_dict argument
is rendered too (see the Airflow PR that added templates_dict rendering).
For Airflow 2.4.1+:
@task(templates_dict={"query": "/home/airflow/gcs/dags/queries/my_query.sql"}, templates_exts=[".sql"])
def test(**context):
query = context["templates_dict"]["query"]
bq_client = bigquery.Client()
bq_client.query(query).result()
test()
For older versions:
class MyCustomOperator(BaseOperator):
template_fields = ("query",)
template_ext = (".sql",)
def __init__(self, query, **kwargs):
super(MyCustomOperator, self).__init__(**kwargs)
self.query = query
def execute(self, context):
bq_client = bigquery.Client()
bq_client.query(self.query).result()
with DAG(
'test',
schedule_interval = None,
start_date = datetime(2022, 1, 1),
user_defined_macros = {
"MY_MACRO": "Hello World"
}
) as dag:
test = MyCustomOperator(task_id="test", query="test.sql")
You can use jinja2 templating to render your query.
from jinja2 import Template
...
@task
def test():
with open(<your_templated_file_path>) as f:
template = Template(f.read())
query = template.render(
**<dict_from_macros_or_other>,
)
bq_client = bigquery.Client()
bq_client.query(query).result()