78 changes: 78 additions & 0 deletions aircan/dags/api_ckan_import_to_bq_v2.py
@@ -0,0 +1,78 @@

import logging
import time
import json
import ast
from datetime import date, datetime

# Local imports
from aircan.dependencies.google_cloud.bigquery_handler_v2 import bq_import_csv

# Third-party library imports
from airflow import DAG
from airflow.exceptions import AirflowException

from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago


args = {
    'start_date': days_ago(0),
    'params': {
        "resource": {
            "path": "path/to/my.csv",
            "format": "CSV",
            "ckan_resource_id": "res-id-123",
            "schema": {
                "fields": "['field1', 'field2']"
            }
        },
        "ckan_config": {
            "api_key": "API_KEY",
            "site_url": "URL",
        },
        "big_query": {
            "bq_project_id": "bigquery_project_id",
            "bq_dataset_id": "bigquery_dataset_id"
        },
        "output_bucket": str(date.today())
    }
}

dag = DAG(
    dag_id='ckan_api_import_to_bq_v2',
    default_args=args,
    schedule_interval=None
)

def task_import_resource_to_bq(**context):
    logging.info('Invoking import resource to bigquery')
    logging.info("resource: {}".format(context['params'].get('resource', {})))

    gc_file_url = context['params'].get('big_query', {}).get('gcs_uri')
    bq_project_id = context['params'].get('big_query', {}).get('bq_project_id')
    bq_dataset_id = context['params'].get('big_query', {}).get('bq_dataset_id')
    bq_table_name = context['params'].get('big_query', {}).get('bq_table_name')
    logging.info("bq_table_name: {}".format(bq_table_name))

    # The schema may arrive as a plain JSON string or double-encoded,
    # hence the extra literal_eval pass below.
    raw_schema = context['params'].get('resource', {}).get('schema')
    eval_schema = json.loads(raw_schema)
    if isinstance(eval_schema, str):
        eval_schema = ast.literal_eval(eval_schema)
    schema = eval_schema.get('fields')
    logging.info("SCHEMA: {}".format(schema))

    # sample bq_table_id: "bigquerytest-271707.nhs_test.dag_test"
    bq_table_id = '%s.%s.%s' % (bq_project_id, bq_dataset_id, bq_table_name)
    logging.info('Importing %s to BQ %s' % (gc_file_url, bq_table_id))
    ckan_conf = context['params'].get('ckan_config', {})
    ckan_conf['resource_id'] = context['params'].get('resource', {}).get('ckan_resource_id')
    bq_import_csv(bq_table_id, gc_file_url, schema, ckan_conf)

import_resource_to_bq_task = PythonOperator(
    task_id='import_resource_to_bq_v2',
    provide_context=True,
    python_callable=task_import_resource_to_bq,
    dag=dag,
)
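For context (not part of the diff): task_import_resource_to_bq parses the schema twice because the value may arrive either as a plain JSON string or double-encoded. A minimal standalone sketch of that round-trip, with illustrative payloads:

import ast
import json

# Plain JSON string -> dict in one pass.
raw_schema = '{"fields": [{"name": "field1", "type": "string"}]}'
parsed = json.loads(raw_schema)

# Double-encoded payload: json.loads() yields a string holding a Python
# dict literal, which ast.literal_eval() then turns into a dict.
double_encoded = '"{\'fields\': [{\'name\': \'field1\', \'type\': \'string\'}]}"'
parsed2 = json.loads(double_encoded)
if isinstance(parsed2, str):
    parsed2 = ast.literal_eval(parsed2)

assert parsed.get('fields') == parsed2.get('fields')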
95 changes: 95 additions & 0 deletions aircan/dependencies/google_cloud/bigquery_handler_v2.py
@@ -0,0 +1,95 @@
from google.cloud import bigquery
import google.api_core.exceptions
from aircan.dependencies.utils import AirflowCKANException, aircan_status_update
import json
import logging

def replace_all(dict, string):
    # Replace every occurrence of each key in `dict` within `string`.
    for key in dict:
        string = string.replace(key, dict[key])
    return string

def bq_import_csv(table_id, gcs_path, table_schema, ckan_conf):
    try:
        client = bigquery.Client()

        job_config = bigquery.LoadJobConfig()

        schema = bq_schema_from_table_schema(table_schema)
        job_config.schema = schema

        job_config.skip_leading_rows = 1
        job_config.source_format = bigquery.SourceFormat.CSV
        # overwrite the table
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
        # Set to True for schema autodetect; kept off since the schema is
        # defined explicitly when publishing data using datapub.
        # job_config.autodetect = True
        load_job = client.load_table_from_uri(
            gcs_path, table_id, job_config=job_config
        )

        load_job.result()  # Waits for table load to complete.
        destination_table = client.get_table(table_id)
        status_dict = {
            'res_id': ckan_conf.get('resource_id'),
            'state': 'progress',
            'message': 'Data ingestion is in progress.'
        }
        aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
⚠️ Potential issue

Fix incorrect status update timing.

The status update logic is incorrect: it sends a "progress" status after the job has already completed successfully. This update should be sent before starting the job.

+        # Update status before starting ingestion
+        status_dict = {
+            'res_id': ckan_conf.get('resource_id'),
+            'state': 'progress',
+            'message': 'Data ingestion is in progress.'
+        }
+        aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
+        
         load_job = client.load_table_from_uri(
             gcs_path, table_id, job_config=job_config
         )

         load_job.result()  # Waits for table load to complete.
         destination_table = client.get_table(table_id)
-        status_dict = {
-            'res_id': ckan_conf.get('resource_id'),
-            'state': 'progress',
-            'message': 'Data ingestion is in progress.'
-        }
-        aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)

        if destination_table:
            status_dict = {
                'res_id': ckan_conf.get('resource_id'),
                'state': 'complete',
                'message': "Ingession Completed"
            }
            aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
            return {'success': True, 'message': 'BigQuery Table created successfully.'}
🛠️ Refactor suggestion

Simplify redundant condition and fix typo.

The condition if destination_table: is always true since get_table() would raise an exception if the table doesn't exist. Also, there's a typo in the message.

-        if destination_table:
-            status_dict = {
-                'res_id': ckan_conf.get('resource_id'),
-                'state': 'complete',
-                'message': "Ingession Completed"
-            }
-            aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
-            return {'success': True, 'message': 'BigQuery Table created successfully.'}
+        # Update status to complete
+        status_dict = {
+            'res_id': ckan_conf.get('resource_id'),
+            'state': 'complete',
+            'message': "Ingestion Completed"
+        }
+        aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
+        return {'success': True, 'message': 'BigQuery Table created successfully.'}
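For reference, get_table() signals a missing table by raising rather than returning a falsy value, which is why the check can never be False. A standalone sketch (the table id is illustrative):

from google.cloud import bigquery
import google.api_core.exceptions

client = bigquery.Client()
try:
    destination_table = client.get_table('my-project.my_dataset.missing_table')
except google.api_core.exceptions.NotFound:
    # A missing table surfaces here; get_table() never returns None.
    destination_table = None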

    except Exception as e:
        replacers = {
            'gs://dx-nhs-staging-giftless/': '',
            'gs://dx-nhs-production-giftless/': '',
            'gs://dx-nhs-prod-giftless/': '',
            'https://bigquery.googleapis.com/bigquery/v2/projects/datopian-dx/jobs?prettyPrint=false': '',
            'datopian-dx': '',
            'bigquery': '',
            'googleapi': '',
            'google': ''
        }
Comment on lines +104 to +114
🛠️ Refactor suggestion

Make error sanitization configurable.

The hardcoded bucket names and project IDs make this code environment-specific and brittle. Consider making this configurable.

-        replacers = {
-            'gs://dx-nhs-staging-giftless/': '',
-            'gs://dx-nhs-production-giftless/': '',
-            'gs://dx-nhs-prod-giftless/': '',
-            'https://bigquery.googleapis.com/bigquery/v2/projects/datopian-dx/jobs?prettyPrint=false': '',
-            'datopian-dx': '',
-            'bigquery': '',
-            'googleapi': '',
-            'google': ''
-
-        }
+        # Configurable sanitization - could be passed as parameter or from config
+        replacers = ckan_conf.get('error_sanitization', {
+            'gs://dx-nhs-staging-giftless/': '',
+            'gs://dx-nhs-production-giftless/': '',
+            'gs://dx-nhs-prod-giftless/': '',
+            'https://bigquery.googleapis.com/bigquery/v2/projects/datopian-dx/jobs?prettyPrint=false': '',
+            'datopian-dx': '',
+            'bigquery': '',
+            'googleapi': '',
+            'google': ''
+        })

Committable suggestion skipped: line range outside the PR's diff.
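One possible direction for this refactor (a sketch, not the existing API): load the replacers from an Airflow Variable so each deployment supplies its own values. The variable name AIRCAN_ERROR_REPLACERS is hypothetical:

import json
import logging

from airflow.models import Variable

def load_error_replacers():
    # Hypothetical Airflow Variable holding a JSON object,
    # e.g. {"gs://my-bucket/": "", "my-project-id": ""}.
    raw = Variable.get('AIRCAN_ERROR_REPLACERS', default_var='{}')
    try:
        return json.loads(raw)
    except ValueError:
        logging.warning('AIRCAN_ERROR_REPLACERS is not valid JSON; ignoring it')
        return {}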


        e = replace_all(replacers, str(e))
        logging.info(e)
        status_dict = {
            'res_id': ckan_conf.get('resource_id'),
            'state': 'failed',
            'message': str(e)
        }
        aircan_status_update(ckan_conf.get('site_url'), ckan_conf.get('api_key'), status_dict)
        raise AirflowCKANException('Data ingestion has failed.', str(e))
🛠️ Refactor suggestion

Use proper exception chaining.

When re-raising exceptions within an except block, use raise ... from err to preserve the original exception context.

-        raise AirflowCKANException('Data ingestion has failed.', str(e))
+        raise AirflowCKANException('Data ingestion has failed.', str(e)) from e
🧰 Tools
🪛 Ruff (0.12.2)

124-124: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling

(B904)
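A standalone illustration of what "from e" preserves (the exception types are illustrative):

class IngestionError(Exception):
    pass

try:
    try:
        raise ValueError('bad CSV row')
    except ValueError as e:
        # `from e` records the original error as __cause__, so tracebacks
        # print it under "The above exception was the direct cause of ...".
        raise IngestionError('Data ingestion has failed.') from e
except IngestionError as err:
    assert isinstance(err.__cause__, ValueError)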




def bq_schema_from_table_schema(table_schema):
    # Map Frictionless Table Schema field types to BigQuery column types.
    mapping = {
        'string': 'STRING',
        'number': 'NUMERIC',
        'integer': 'NUMERIC',
        'boolean': 'BOOLEAN',
        'object': 'STRING',
        'array': 'STRING',
        'date': 'DATE',
        'time': 'TIME',
        'datetime': 'DATETIME',
        'year': 'NUMERIC',
        'yearmonth': 'STRING',
        'duration': 'DATETIME',
        'geopoint': 'GEOPOINT',
        'geojson': 'STRING',
        'any': 'STRING'
    }

    def _convert(field):
        # TODO: support for e.g. required
        return bigquery.SchemaField(field['name'],
                                    mapping.get(field['type'], field['type']),
                                    'NULLABLE')
    return [_convert(field) for field in table_schema]
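A quick usage sketch (the field list is illustrative); note that unmapped types pass through unchanged via mapping.get(field['type'], field['type']):

fields = [
    {'name': 'id', 'type': 'integer'},
    {'name': 'created', 'type': 'datetime'},
]
schema = bq_schema_from_table_schema(fields)
# -> [SchemaField('id', 'NUMERIC', 'NULLABLE'),
#     SchemaField('created', 'DATETIME', 'NULLABLE')]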