How to load a BigQuery table from a file in GCS Bucket using Airflow?
Question:
I am new to Airflow, and I am wondering, how do I load a file from a GCS Bucket to BigQuery?
So far, I have managed to do BigQuery to GCS Bucket:
# Query task: selects the 100 most-viewed Stack Overflow questions created in
# [min_query_date, max_query_date) from the public dataset and materializes
# the result into a destination table.
# NOTE(review): `bigquery_operator`, `max_query_date`, `min_query_date`,
# `bq_recent_questions_table_id` and `output_file` are assumed to be
# imported/defined earlier in the DAG file — confirm against the full source.
bq_recent_questions_query = bigquery_operator.BigQueryOperator(
task_id='bq_recent_questions_query',
sql="""
SELECT owner_display_name, title, view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
ORDER BY view_count DESC
LIMIT 100
""".format(max_date=max_query_date, min_date=min_query_date),
use_legacy_sql=False,
destination_dataset_table=bq_recent_questions_table_id)
# Export query result to Cloud Storage.
# Exports the table produced by the query task above to the GCS URI(s) in
# `output_file`, as CSV.
export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
task_id='export_recent_questions_to_gcs',
source_project_dataset_table=bq_recent_questions_table_id,
destination_cloud_storage_uris=[output_file],
export_format='CSV')
Can someone help me to modify my current code, so I can load a file from a GCS Bucket and load it to BigQuery?
Answers:
For your requirement, you can use GCSToBigQueryOperator, an Airflow operator that transfers files from Cloud Storage to BigQuery. For more information, check the operator's documentation. You can try the code below.
gcs_to_bq_operator.py
"""Example DAG: load a CSV file from a GCS bucket into a BigQuery table.

Creates the destination dataset, then uses GCSToBigQueryOperator to load the
file with an explicit schema, truncating the target table on each run.
"""
import os

from airflow import DAG
from airflow import models
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateEmptyDatasetOperator,
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.utils.dates import days_ago

# BigQuery dataset IDs may contain only letters, digits and underscores;
# the previous default 'new-dataset' (with a hyphen) is rejected by BigQuery.
DATASET_NAME = os.environ.get("GCP_DATASET_NAME", 'new_dataset')
TABLE_NAME = os.environ.get("GCP_TABLE_NAME", 'Country')

dag = models.DAG(
    dag_id='gcs_to_bq_operator',
    start_date=days_ago(2),
    schedule_interval='@once',
    tags=['example'],
)

# Make sure the destination dataset exists before anything loads into it.
create_test_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id='create_airflow_test_dataset', dataset_id=DATASET_NAME, dag=dag
)

# [START howto_operator_gcs_to_bigquery]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bigquery_example',
    bucket='sample-bucket',
    # source_objects takes object paths *relative to the bucket*
    # (e.g. 'data/file.csv'), not a full gs:// URI.
    source_objects=['path/to/file.csv'],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=[
        {'name': 'Year', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Country', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'number', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'result', 'type': 'INTEGER', 'mode': 'NULLABLE'}
    ],
    write_disposition='WRITE_TRUNCATE',
    dag=dag,
)

# The dataset must be created before the load task runs; without this
# dependency the two tasks could execute in either order.
create_test_dataset >> load_csv
I’m a bit confused about the question; from your title it looks like you want to create a BigQuery table from a file in GCS. I’m assuming this in my answer.
You can leverage the load_file
operator from Astro SDK which allows rapid and clean development of {Extract, Load, Transform} workflows using Python and SQL, powered by Apache Airflow.
load_file
operator can help you transfer data from blob storages (Azure, GCS, S3 – see Supported locations), file servers (SFTP, FTP) and the local filesystem to databases (BigQuery, Redshift, Snowflake, SQLite, Postgres, DuckDB, MSSQL, …). For a complete list of supported databases, check Supported Databases. In some cases you can even optimize the transfer by using a native path, which is very useful for larger files (roughly a GB or more).
Here is an example –
"""Example DAG: load a GCS CSV into BigQuery with the Astro SDK, then query it."""
from datetime import datetime

from airflow.models import DAG
from astro import sql as aql
from astro.files import File
from astro.table import Metadata, Table

default_args = {
    "owner": "airflow",
    "retries": 1,
    "retry_delay": 0,
}

dag = DAG(
    dag_id="example_load_file",
    start_date=datetime(2019, 1, 1),
    max_active_runs=3,
    schedule_interval=None,
    default_args=default_args,
    catchup=False,
)


def handle_result(result):
    """Return every row of the executed query.

    The original example referenced an undefined ``handle_result``; defining
    it here lets the DAG file actually parse and run.
    """
    return result.fetchall()


@aql.run_raw_sql(handler=handle_result)
def filter_table(table: Table,
                 max_date: str = "2023-01-01",
                 min_date: str = "2022-01-01"):
    """Return the 100 most-viewed rows of *table* in [min_date, max_date).

    Function arguments are exposed to the SQL as ``{{ }}`` parameters; the
    original ``'{max_date}'`` placeholders were never substituted and would
    have reached BigQuery as literal text.
    """
    return """
        SELECT owner_display_name, title, view_count
        FROM {{table}}
        WHERE creation_date < CAST({{max_date}} AS TIMESTAMP)
          AND creation_date >= CAST({{min_date}} AS TIMESTAMP)
        ORDER BY view_count DESC
        LIMIT 100
    """


with dag:
    # Load the CSV from GCS into an auto-named BigQuery table ...
    test_table = aql.load_file(
        input_file=File("gcs://replace_me/replace_me.csv", conn_id="gcs_conn"),
        output_table=Table(conn_id="bigquery_conn"),
    )
    # ... then run the filtering query against it.
    filter_table(test_table)
Please look into the documentation for further help.
I am new to Airflow, and I am wondering, how do I load a file from a GCS Bucket to BigQuery?
So far, I have managed to do BigQuery to GCS Bucket:
# Query task: selects the 100 most-viewed Stack Overflow questions created in
# [min_query_date, max_query_date) from the public dataset and materializes
# the result into a destination table.
# NOTE(review): `bigquery_operator`, `max_query_date`, `min_query_date`,
# `bq_recent_questions_table_id` and `output_file` are assumed to be
# imported/defined earlier in the DAG file — confirm against the full source.
bq_recent_questions_query = bigquery_operator.BigQueryOperator(
task_id='bq_recent_questions_query',
sql="""
SELECT owner_display_name, title, view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
ORDER BY view_count DESC
LIMIT 100
""".format(max_date=max_query_date, min_date=min_query_date),
use_legacy_sql=False,
destination_dataset_table=bq_recent_questions_table_id)
# Export query result to Cloud Storage.
# Exports the table produced by the query task above to the GCS URI(s) in
# `output_file`, as CSV.
export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
task_id='export_recent_questions_to_gcs',
source_project_dataset_table=bq_recent_questions_table_id,
destination_cloud_storage_uris=[output_file],
export_format='CSV')
Can someone help me to modify my current code, so I can load a file from a GCS Bucket and load it to BigQuery?
For your requirement, you can use GCSToBigQueryOperator, an Airflow operator that transfers files from Cloud Storage to BigQuery. For more information, check the operator's documentation. You can try the code below.
gcs_to_bq_operator.py
"""Example DAG: load a CSV file from a GCS bucket into a BigQuery table.

Creates the destination dataset, then uses GCSToBigQueryOperator to load the
file with an explicit schema, truncating the target table on each run.
"""
import os

from airflow import DAG
from airflow import models
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateEmptyDatasetOperator,
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.utils.dates import days_ago

# BigQuery dataset IDs may contain only letters, digits and underscores;
# the previous default 'new-dataset' (with a hyphen) is rejected by BigQuery.
DATASET_NAME = os.environ.get("GCP_DATASET_NAME", 'new_dataset')
TABLE_NAME = os.environ.get("GCP_TABLE_NAME", 'Country')

dag = models.DAG(
    dag_id='gcs_to_bq_operator',
    start_date=days_ago(2),
    schedule_interval='@once',
    tags=['example'],
)

# Make sure the destination dataset exists before anything loads into it.
create_test_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id='create_airflow_test_dataset', dataset_id=DATASET_NAME, dag=dag
)

# [START howto_operator_gcs_to_bigquery]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bigquery_example',
    bucket='sample-bucket',
    # source_objects takes object paths *relative to the bucket*
    # (e.g. 'data/file.csv'), not a full gs:// URI.
    source_objects=['path/to/file.csv'],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=[
        {'name': 'Year', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Country', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'number', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'result', 'type': 'INTEGER', 'mode': 'NULLABLE'}
    ],
    write_disposition='WRITE_TRUNCATE',
    dag=dag,
)

# The dataset must be created before the load task runs; without this
# dependency the two tasks could execute in either order.
create_test_dataset >> load_csv
I’m a bit confused about the question; from your title it looks like you want to create a BigQuery table from a file in GCS. I’m assuming this in my answer.
You can leverage the load_file
operator from Astro SDK which allows rapid and clean development of {Extract, Load, Transform} workflows using Python and SQL, powered by Apache Airflow.
load_file
operator can help you transfer data from blob storages (Azure, GCS, S3 – see Supported locations), file servers (SFTP, FTP) and the local filesystem to databases (BigQuery, Redshift, Snowflake, SQLite, Postgres, DuckDB, MSSQL, …). For a complete list of supported databases, check Supported Databases. In some cases you can even optimize the transfer by using a native path, which is very useful for larger files (roughly a GB or more).
Here is an example –
"""Example DAG: load a GCS CSV into BigQuery with the Astro SDK, then query it."""
from datetime import datetime

from airflow.models import DAG
from astro import sql as aql
from astro.files import File
from astro.table import Metadata, Table

default_args = {
    "owner": "airflow",
    "retries": 1,
    "retry_delay": 0,
}

dag = DAG(
    dag_id="example_load_file",
    start_date=datetime(2019, 1, 1),
    max_active_runs=3,
    schedule_interval=None,
    default_args=default_args,
    catchup=False,
)


def handle_result(result):
    """Return every row of the executed query.

    The original example referenced an undefined ``handle_result``; defining
    it here lets the DAG file actually parse and run.
    """
    return result.fetchall()


@aql.run_raw_sql(handler=handle_result)
def filter_table(table: Table,
                 max_date: str = "2023-01-01",
                 min_date: str = "2022-01-01"):
    """Return the 100 most-viewed rows of *table* in [min_date, max_date).

    Function arguments are exposed to the SQL as ``{{ }}`` parameters; the
    original ``'{max_date}'`` placeholders were never substituted and would
    have reached BigQuery as literal text.
    """
    return """
        SELECT owner_display_name, title, view_count
        FROM {{table}}
        WHERE creation_date < CAST({{max_date}} AS TIMESTAMP)
          AND creation_date >= CAST({{min_date}} AS TIMESTAMP)
        ORDER BY view_count DESC
        LIMIT 100
    """


with dag:
    # Load the CSV from GCS into an auto-named BigQuery table ...
    test_table = aql.load_file(
        input_file=File("gcs://replace_me/replace_me.csv", conn_id="gcs_conn"),
        output_table=Table(conn_id="bigquery_conn"),
    )
    # ... then run the filtering query against it.
    filter_table(test_table)
Please look into the documentation for further help.