How to have multiple input and output connections for an Airflow DAG task that uses a global variable (pandas data frame) within @task.external_python?
Question:
GOAL
- I use the Docker 2.4.1 version of Airflow
- I use my external python virtual environment for each task
- I have a normal python integer that I want to pass on from task to task.
- The graph should start from a single node, "start", which then pushes its result to x, y and z; after that, all of x, y and z should feed into "compare", which picks and prints the highest value.
CODE
from __future__ import annotations
import logging
import os
import shutil
import sys
import tempfile
import time
from pprint import pprint
from datetime import timedelta
import pendulum
from airflow import DAG
from airflow.decorators import task
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Path of the interpreter that is executing this file.
PYTHON = sys.executable
# The system's temporary directory.
BASE_DIR = tempfile.gettempdir()

# Default task arguments for tasks that are essential and whose first failure
# we want to know about.
my_default_args = {
    'owner': 'Anonymus',
    'email': ['[email protected]'],
    'email_on_failure': True,
    'email_on_retry': False,  # only allow if it was allowed in the scheduler
    # 'retries': 1,  # only allow if it was allowed in the scheduler
    # 'retry_delay': timedelta(minutes=1),
}
# DAG definition; the tasks are declared inside this `with` block.
with DAG(
dag_id='sample_many_task_connections',
# https://crontab.guru/
# 0-7, where 0 or 7 is Sunday
# min HOUR DAY_OF_MONTH MONTH DAY_OF_WEEK
# * * * * *
schedule='12 11 * * *', # Cron expression, interpreted in UTC. E.g. 11:12 UTC = 11:12 GMT = 12:12 (pm) BST
start_date=pendulum.datetime(2023, 1, 1, tz="UTC"), # the point from which Airflow starts counting time to run tasks; unlike cron, this is not a trigger time
catchup=False,
#execution_timeout=timedelta(seconds=60),
default_args=my_default_args,
tags=['sample_tag', 'sample_tag2'], ### !!! also add 'xRetry' to tags so we can see whether a DAG has the retry feature in it
) as dag:
#@task.external_python(task_id="test_external_python_venv_task", python=os.fspath(sys.executable))
@task.external_python(task_id="start", python='/opt/airflow/v1/bin/python3')
def start():  # the function name is arbitrary
    """Print the initial value 1 and return it for the downstream tasks."""
    seed = 1
    print(seed)
    return seed
@task.external_python(task_id="random_function_x", python='/opt/airflow/v1/bin/python3')
def random_function_x(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('x: ', drawn)
    total = drawn + start
    print('x += start: ', total)
    return total
@task.external_python(task_id="random_function_y", python='/opt/airflow/v1/bin/python3')
def random_function_y(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('y: ', drawn)
    total = drawn + start
    print('y += start: ', total)
    return total
@task.external_python(task_id="random_function_z", python='/opt/airflow/v1/bin/python3')
def random_function_z(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('z: ', drawn)
    total = drawn + start
    print('z += start: ', total)
    return total
@task.external_python(task_id="compare", python='/opt/airflow/v1/bin/python3')
def compare(x, y, z):
    """Print which of ``x``, ``y``, ``z`` is the largest and return that value.

    Ties are resolved in favour of the earlier argument (x before y before z).
    The comparisons are non-strict on purpose: with strict ``>`` a tie between
    x and y that exceeds z would fall through to the last branch and wrongly
    report z as the largest.

    Args:
        x: first candidate value.
        y: second candidate value.
        z: third candidate value.

    Returns:
        The largest of the three values.
    """
    if x >= y and x >= z:
        print('x is the largest', x)
        return x
    # x is not the maximum here, so the largest is whichever of y and z wins.
    if y >= z:
        print('y is the largest', y)
        return y
    print('z is the largest', z)
    return z
# NOTE: the three task results are wrapped in ONE list, so compare() receives a single positional argument (bound to x) and y, z are missing -- this is the TypeError shown under ERROR below.
compare([random_function_x(start()), random_function_y(start()), random_function_z(start())])
ERROR
error DAG Import Errors (1)
Broken DAG: [/opt/airflow/dags/sample_many_task_connections.py] Traceback (most recent call last):
File "/usr/local/lib/python3.8/inspect.py", line 3037, in bind
return self._bind(args, kwargs)
File "/usr/local/lib/python3.8/inspect.py", line 2952, in _bind
raise TypeError(msg) from None
TypeError: missing a required argument: 'y'
Tried
- Based on previous Q&As I could not resolve this issue – How to use a python list as global variable pandas data frame with in @task.external_python? & How to use a python list as global variable python list with in @task.external_python?
Answers:
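compare() takes three positional parameters (x, y, z), but the list is passed as one single argument, so only x is bound and y is reported as missing. Unpacking the list of tasks being passed to compare()
will resolve the import error.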
# The leading * unpacks the list into the three positional arguments x, y and z.
compare(*[random_function_x(start()), random_function_y(start()), random_function_z(start())])
But if you want to have only one node for the start
task, you need to call it only once, because every call to start() adds a separate task to the DAG. Try something like this:
# Call start() once and reuse its result as the input of all three downstream tasks.
_start = start()
compare(*[random_function_x(_start), random_function_y(_start), random_function_z(_start)])
GOAL
- I use the Docker 2.4.1 version of Airflow
- I use my external python virtual environment for each task
- I have a normal python integer that I want to pass on from task to task.
- The graph should start from a single node, "start", which then pushes its result to x, y and z; after that, all of x, y and z should feed into "compare", which picks and prints the highest value.
CODE
from __future__ import annotations
import logging
import os
import shutil
import sys
import tempfile
import time
from pprint import pprint
from datetime import timedelta
import pendulum
from airflow import DAG
from airflow.decorators import task
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Path of the interpreter that is executing this file.
PYTHON = sys.executable
# The system's temporary directory.
BASE_DIR = tempfile.gettempdir()

# Default task arguments for tasks that are essential and whose first failure
# we want to know about.
my_default_args = {
    'owner': 'Anonymus',
    'email': ['[email protected]'],
    'email_on_failure': True,
    'email_on_retry': False,  # only allow if it was allowed in the scheduler
    # 'retries': 1,  # only allow if it was allowed in the scheduler
    # 'retry_delay': timedelta(minutes=1),
}
# DAG definition; the tasks are declared inside this `with` block.
with DAG(
dag_id='sample_many_task_connections',
# https://crontab.guru/
# 0-7, where 0 or 7 is Sunday
# min HOUR DAY_OF_MONTH MONTH DAY_OF_WEEK
# * * * * *
schedule='12 11 * * *', # Cron expression, interpreted in UTC. E.g. 11:12 UTC = 11:12 GMT = 12:12 (pm) BST
start_date=pendulum.datetime(2023, 1, 1, tz="UTC"), # the point from which Airflow starts counting time to run tasks; unlike cron, this is not a trigger time
catchup=False,
#execution_timeout=timedelta(seconds=60),
default_args=my_default_args,
tags=['sample_tag', 'sample_tag2'], ### !!! also add 'xRetry' to tags so we can see whether a DAG has the retry feature in it
) as dag:
#@task.external_python(task_id="test_external_python_venv_task", python=os.fspath(sys.executable))
@task.external_python(task_id="start", python='/opt/airflow/v1/bin/python3')
def start():  # the function name is arbitrary
    """Print the initial value 1 and return it for the downstream tasks."""
    seed = 1
    print(seed)
    return seed
@task.external_python(task_id="random_function_x", python='/opt/airflow/v1/bin/python3')
def random_function_x(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('x: ', drawn)
    total = drawn + start
    print('x += start: ', total)
    return total
@task.external_python(task_id="random_function_y", python='/opt/airflow/v1/bin/python3')
def random_function_y(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('y: ', drawn)
    total = drawn + start
    print('y += start: ', total)
    return total
@task.external_python(task_id="random_function_z", python='/opt/airflow/v1/bin/python3')
def random_function_z(start):
    """Draw a random integer in [1, 100], add ``start`` and return the sum.

    The incoming value, the drawn number and the final sum are each printed.
    """
    # Imported inside the function body, as the original task does.
    from random import randint

    print('start: ', start)
    drawn = randint(1, 100)
    print('z: ', drawn)
    total = drawn + start
    print('z += start: ', total)
    return total
@task.external_python(task_id="compare", python='/opt/airflow/v1/bin/python3')
def compare(x, y, z):
    """Print which of ``x``, ``y``, ``z`` is the largest and return that value.

    Ties are resolved in favour of the earlier argument (x before y before z).
    The comparisons are non-strict on purpose: with strict ``>`` a tie between
    x and y that exceeds z would fall through to the last branch and wrongly
    report z as the largest.

    Args:
        x: first candidate value.
        y: second candidate value.
        z: third candidate value.

    Returns:
        The largest of the three values.
    """
    if x >= y and x >= z:
        print('x is the largest', x)
        return x
    # x is not the maximum here, so the largest is whichever of y and z wins.
    if y >= z:
        print('y is the largest', y)
        return y
    print('z is the largest', z)
    return z
# NOTE: the three task results are wrapped in ONE list, so compare() receives a single positional argument (bound to x) and y, z are missing -- this is the TypeError shown under ERROR below.
compare([random_function_x(start()), random_function_y(start()), random_function_z(start())])
ERROR
error DAG Import Errors (1)
Broken DAG: [/opt/airflow/dags/sample_many_task_connections.py] Traceback (most recent call last):
File "/usr/local/lib/python3.8/inspect.py", line 3037, in bind
return self._bind(args, kwargs)
File "/usr/local/lib/python3.8/inspect.py", line 2952, in _bind
raise TypeError(msg) from None
TypeError: missing a required argument: 'y'
Tried
- Based on previous Q&As I could not resolve this issue – How to use a python list as global variable pandas data frame with in @task.external_python? & How to use a python list as global variable python list with in @task.external_python?
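compare() takes three positional parameters (x, y, z), but the list is passed as one single argument, so only x is bound and y is reported as missing. Unpacking the list of tasks being passed to compare()
will resolve the import error.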
# The leading * unpacks the list into the three positional arguments x, y and z.
compare(*[random_function_x(start()), random_function_y(start()), random_function_z(start())])
But if you want to have only one node for the start
task, you need to call it only once, because every call to start() adds a separate task to the DAG. Try something like this:
# Call start() once and reuse its result as the input of all three downstream tasks.
_start = start()
compare(*[random_function_x(_start), random_function_y(_start), random_function_z(_start)])