Airflow 2.0: tasks getting skipped after BranchPythonOperator
Question:
I’m fiddling with branches in Airflow in the new version and no matter what I try, all the tasks after the BranchOperator get skipped.
Here is a minimal example of what I’ve been trying to accomplish
from airflow.decorators import dag, task
from datetime import timedelta, datetime
from airflow.operators.python import BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule
import logging
logger = logging.getLogger("airflow.task")
@dag(
    schedule_interval="0 0 * * *",
    start_date=datetime.today() - timedelta(days=2),
    dagrun_timeout=timedelta(minutes=60),
)
def StackOverflowExample():
    # Pipeline shape:
    #   task_A >> task_B >> task_C >> task_D >> choose_best_model
    #   choose_best_model >> [branch_1, branch_2] >> task_final

    @task
    def task_A():
        logging.info("TASK A")

    @task
    def task_B():
        logging.info("TASK B")

    @task
    def task_C():
        logging.info("TASK C")

    @task
    def task_D():
        # The returned dict is what the branch callable inspects.
        logging.info("TASK D")
        return {"parameter": 0.5}

    def _choose_task(task_parameters, **kwargs):
        # Branch callable: returns the task ids that should keep running.
        score = task_parameters["parameter"]
        logging.info(score)
        if score < 0.5:
            logging.info("SUCCESSS ")
            return ['branch_1', 'task_final']
        logging.info("RIP")
        return ['branch_2', 'task_final']

    @task(task_id="branch_1")
    def branch_1():
        logging.info("branch_1...")

    @task(task_id="branch_2")
    def branch_2():
        logging.info("branch_2")

    @task(task_id="task_final")
    def task_final():
        logging.info("task_final")

    # Chaining with >> yields the right-most operand, i.e. task_D's output.
    parameter = task_A() >> task_B() >> task_C() >> task_D()

    choose_task = BranchPythonOperator(
        task_id='choose_best_model',
        python_callable=_choose_task,
        op_kwargs={"task_parameters": parameter},
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    # Fan out to both candidate branches, then join on task_final.
    fan_out = choose_task >> [branch_1(), branch_2()]
    fan_out >> task_final()


dag = StackOverflowExample()
Any clues? I'm suspicious of the trigger rule. I'm an Airflow beginner, so I wouldn't rule out other problems that I'm overlooking.
Answers:
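You should set the trigger rule on task_final rather than on the branch operator. With the default trigger rule (all_success), task_final is skipped because one of its upstream branches is always skipped.
You want task_final to be executed after branch_1 and branch_2 have finished their execution (regardless of which one of them was executed or skipped), so you need to set the all_done trigger rule on it: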
# Join step: ALL_DONE lets it run once every upstream branch has finished,
# whether that branch was executed or skipped.
@task(trigger_rule=TriggerRule.ALL_DONE, task_id="task_final")
def task_final():
    logging.info("task_final")
Following the same example, I want to pass values between tasks, from task_A through to task_final.
I've simplified the above example so that it contains only task_A and task_D.
from airflow.decorators import dag, task
from datetime import timedelta, datetime
from typing import Dict
from airflow.operators.python import BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule
import logging
logger = logging.getLogger("airflow.task")
@dag(
    schedule_interval="0 0 * * *",
    start_date=datetime.today() - timedelta(days=2),
    dagrun_timeout=timedelta(minutes=60),
)
def StackOverflowExample():
    """Pass a run-conf value through two tasks and branch on it.

    task_A reads ``dummy`` from the run's conf, task_D forwards it, and
    ``choose_best_model`` selects ``branch_1`` or ``branch_2`` from that
    value. ``task_final`` joins both branches.
    """

    @task
    def task_A(**kwargs) -> Dict[str, str]:
        # ``or {}`` keeps the lookup safe when the run carries no conf.
        inp = (kwargs['dag_run'].conf or {}).get('dummy')
        logging.info("TASK A")
        logging.info("dag dummy:" + str(inp))
        return {'dummy': inp}

    @task
    def task_D(param: Dict[str, str]) -> Dict[str, str]:
        logging.info("TASK D")
        logging.info(param['dummy'])
        return param

    def _choose_task(param: Dict[str, str]):
        """Return the task ids to follow, based on ``param["dummy"]``.

        A missing value (``None``) cannot be compared with 10, so it is
        routed to ``branch_2`` instead of raising ``TypeError``.
        """
        logging.info('-----choose_task')
        value = param["dummy"]
        logging.info(value)
        if value is not None and value < 10:
            logging.info("SUCCESSS ")
            return ['branch_1', 'task_final']
        logging.info("else")
        return ['branch_2', 'task_final']

    # To use the value inside a branch task, give it a ``param`` argument
    # and call it as ``branch_1(parameter)`` when wiring the DAG below.
    @task(task_id="branch_1")
    def branch_1():
        logging.info("branch_1...")

    @task(task_id="branch_2")
    def branch_2():
        logging.info("branch_2")

    @task(task_id="task_final", trigger_rule=TriggerRule.ALL_DONE)
    def task_final():
        logging.info("task_final")

    parameter = task_D(task_A())

    choose_task = BranchPythonOperator(
        task_id='choose_best_model',
        # The op_kwargs key must match the callable's parameter name
        # (``param``). Any other key, e.g. "task_parameters", fails with
        # "TypeError: _choose_task() missing 1 required positional
        # argument: 'param'".
        op_kwargs={"param": parameter},
        python_callable=_choose_task,
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    choose_task >> [branch_1(), branch_2()] >> task_final()


dag = StackOverflowExample()
When I run this I get the error:
Traceback (most recent call last):
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 206, in execute
branch = super().execute(context)
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 174, in execute
return_value = self.execute_callable()
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 188, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
TypeError: _choose_task() missing 1 required positional argument: 'param'
Any idea what I am missing, please? I want to pass the "dummy" value to "branch_1" and "branch_2".
I’m fiddling with branches in Airflow in the new version and no matter what I try, all the tasks after the BranchOperator get skipped.
Here is a minimal example of what I’ve been trying to accomplish
from airflow.decorators import dag, task
from datetime import timedelta, datetime
from airflow.operators.python import BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule
import logging
logger = logging.getLogger("airflow.task")
@dag(
    schedule_interval="0 0 * * *",
    start_date=datetime.today() - timedelta(days=2),
    dagrun_timeout=timedelta(minutes=60),
)
def StackOverflowExample():
    # Pipeline shape:
    #   task_A >> task_B >> task_C >> task_D >> choose_best_model
    #   choose_best_model >> [branch_1, branch_2] >> task_final

    @task
    def task_A():
        logging.info("TASK A")

    @task
    def task_B():
        logging.info("TASK B")

    @task
    def task_C():
        logging.info("TASK C")

    @task
    def task_D():
        # The returned dict is what the branch callable inspects.
        logging.info("TASK D")
        return {"parameter": 0.5}

    def _choose_task(task_parameters, **kwargs):
        # Branch callable: returns the task ids that should keep running.
        score = task_parameters["parameter"]
        logging.info(score)
        if score < 0.5:
            logging.info("SUCCESSS ")
            return ['branch_1', 'task_final']
        logging.info("RIP")
        return ['branch_2', 'task_final']

    @task(task_id="branch_1")
    def branch_1():
        logging.info("branch_1...")

    @task(task_id="branch_2")
    def branch_2():
        logging.info("branch_2")

    @task(task_id="task_final")
    def task_final():
        logging.info("task_final")

    # Chaining with >> yields the right-most operand, i.e. task_D's output.
    parameter = task_A() >> task_B() >> task_C() >> task_D()

    choose_task = BranchPythonOperator(
        task_id='choose_best_model',
        python_callable=_choose_task,
        op_kwargs={"task_parameters": parameter},
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    # Fan out to both candidate branches, then join on task_final.
    fan_out = choose_task >> [branch_1(), branch_2()]
    fan_out >> task_final()


dag = StackOverflowExample()
Any clues? I'm suspicious of the trigger rule. I'm an Airflow beginner, so I wouldn't rule out other problems that I'm overlooking.
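You should set the trigger rule on task_final rather than on the branch operator. With the default trigger rule (all_success), task_final is skipped because one of its upstream branches is always skipped.
You want task_final to be executed after branch_1 and branch_2 have finished their execution (regardless of which one of them was executed or skipped), so you need to set the all_done trigger rule on it: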
# Join step: ALL_DONE lets it run once every upstream branch has finished,
# whether that branch was executed or skipped.
@task(trigger_rule=TriggerRule.ALL_DONE, task_id="task_final")
def task_final():
    logging.info("task_final")
Following the same example, I want to pass values between tasks, from task_A through to task_final.
I've simplified the above example so that it contains only task_A and task_D.
from airflow.decorators import dag, task
from datetime import timedelta, datetime
from typing import Dict
from airflow.operators.python import BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule
import logging
logger = logging.getLogger("airflow.task")
@dag(
    schedule_interval="0 0 * * *",
    start_date=datetime.today() - timedelta(days=2),
    dagrun_timeout=timedelta(minutes=60),
)
def StackOverflowExample():
    """Pass a run-conf value through two tasks and branch on it.

    task_A reads ``dummy`` from the run's conf, task_D forwards it, and
    ``choose_best_model`` selects ``branch_1`` or ``branch_2`` from that
    value. ``task_final`` joins both branches.
    """

    @task
    def task_A(**kwargs) -> Dict[str, str]:
        # ``or {}`` keeps the lookup safe when the run carries no conf.
        inp = (kwargs['dag_run'].conf or {}).get('dummy')
        logging.info("TASK A")
        logging.info("dag dummy:" + str(inp))
        return {'dummy': inp}

    @task
    def task_D(param: Dict[str, str]) -> Dict[str, str]:
        logging.info("TASK D")
        logging.info(param['dummy'])
        return param

    def _choose_task(param: Dict[str, str]):
        """Return the task ids to follow, based on ``param["dummy"]``.

        A missing value (``None``) cannot be compared with 10, so it is
        routed to ``branch_2`` instead of raising ``TypeError``.
        """
        logging.info('-----choose_task')
        value = param["dummy"]
        logging.info(value)
        if value is not None and value < 10:
            logging.info("SUCCESSS ")
            return ['branch_1', 'task_final']
        logging.info("else")
        return ['branch_2', 'task_final']

    # To use the value inside a branch task, give it a ``param`` argument
    # and call it as ``branch_1(parameter)`` when wiring the DAG below.
    @task(task_id="branch_1")
    def branch_1():
        logging.info("branch_1...")

    @task(task_id="branch_2")
    def branch_2():
        logging.info("branch_2")

    @task(task_id="task_final", trigger_rule=TriggerRule.ALL_DONE)
    def task_final():
        logging.info("task_final")

    parameter = task_D(task_A())

    choose_task = BranchPythonOperator(
        task_id='choose_best_model',
        # The op_kwargs key must match the callable's parameter name
        # (``param``). Any other key, e.g. "task_parameters", fails with
        # "TypeError: _choose_task() missing 1 required positional
        # argument: 'param'".
        op_kwargs={"param": parameter},
        python_callable=_choose_task,
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    choose_task >> [branch_1(), branch_2()] >> task_final()


dag = StackOverflowExample()
When I run this I get the error:
Traceback (most recent call last):
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 206, in execute
branch = super().execute(context)
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 174, in execute
return_value = self.execute_callable()
File "/opt/python3.8/lib/python3.8/site-packages/airflow/operators/python.py", line 188, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
TypeError: _choose_task() missing 1 required positional argument: 'param'
Any idea what I am missing, please? I want to pass the "dummy" value to "branch_1" and "branch_2".