
Commit e454e90

Merge branch 'master-queensland' into feature/qld-gov-au/remove-unsupported-datastore-tables
Conflicts: ckanext/xloader/plugin.py (resolved)
2 parents: 7fabca4 + afc45b7

10 files changed: +231 −53 lines

README.rst (+5)
@@ -191,6 +191,11 @@ Configuration:
 See the extension's `config_declaration.yaml <ckanext/xloader/config_declaration.yaml>`_ file.
 
+You may also wish to configure the database to use your preferred date input style on COPY.
+For example, to make `PostgreSQL <https://www.postgresql.org/docs/current/runtime-config-client.html#RUNTIME-CONFIG-CLIENT-FORMAT>`_
+expect European (day-first) dates, you could add to ``postgresql.conf``:
+
+    datestyle=ISO,DMY
 
 ------------------------
 Developer installation
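A quick way to confirm the setting took effect is to ask the session for its datestyle. A minimal sketch, assuming a local datastore database named datastore_default (the DSN is an assumption, not part of this commit):

    import psycopg2

    conn = psycopg2.connect("dbname=datastore_default")  # assumed DSN; adjust to your setup
    with conn, conn.cursor() as cur:
        cur.execute("SHOW datestyle")
        print(cur.fetchone()[0])  # expect something like 'ISO, DMY'
        # with DMY, an input like 05/02/2024 parses as 5 February, not 2 May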

ckanext/xloader/db.py (+1 −9)
@@ -191,9 +191,7 @@ def add_pending_job(job_id, job_type, api_key,
     if not metadata:
         metadata = {}
 
-    conn = ENGINE.connect()
-    trans = conn.begin()
-    try:
+    with ENGINE.begin() as conn:
         conn.execute(JOBS_TABLE.insert().values(
             job_id=job_id,
             job_type=job_type,
@@ -225,12 +223,6 @@ def add_pending_job(job_id, job_type, api_key,
             )
         if inserts:
             conn.execute(METADATA_TABLE.insert(), inserts)
-        trans.commit()
-    except Exception:
-        trans.rollback()
-        raise
-    finally:
-        conn.close()
 
 
 class InvalidErrorObjectError(Exception):
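For context, SQLAlchemy's Engine.begin() is the context-manager form of the connect/begin/commit/rollback/close sequence the deleted code spelled out by hand. A minimal sketch of the equivalence, with a hypothetical DSN and table rather than xloader's real schema:

    import sqlalchemy as sa

    engine = sa.create_engine("postgresql:///ckan_jobs")  # hypothetical DSN
    jobs = sa.table("jobs", sa.column("job_id"), sa.column("status"))  # illustrative table

    # Old shape: explicit transaction plumbing.
    conn = engine.connect()
    trans = conn.begin()
    try:
        conn.execute(jobs.insert().values(job_id="abc", status="pending"))
        trans.commit()
    except Exception:
        trans.rollback()
        raise
    finally:
        conn.close()

    # New shape: commits on success, rolls back on any exception, always closes.
    with engine.begin() as conn:
        conn.execute(jobs.insert().values(job_id="def", status="pending"))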

ckanext/xloader/jobs.py (+25 −5)
@@ -11,13 +11,14 @@
 import traceback
 import sys
 
+from psycopg2 import errors
 from six.moves.urllib.parse import urlsplit
 import requests
 from rq import get_current_job
 import sqlalchemy as sa
 
 from ckan import model
-from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
+from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config
 
 from . import db, loader
 from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
@@ -41,6 +42,15 @@
 CHUNK_SIZE = 16 * 1024  # 16kb
 DOWNLOAD_TIMEOUT = 30
 
+RETRYABLE_ERRORS = (
+    errors.DeadlockDetected,
+    errors.LockNotAvailable,
+    errors.ObjectInUse,
+)
+# Retries can only occur in cases where the datastore entry exists,
+# so use the standard timeout
+RETRIED_JOB_TIMEOUT = config.get('ckanext.xloader.job_timeout', '3600')
+
 
 # input = {
 #     'api_key': user['apikey'],
@@ -87,6 +97,19 @@ def xloader_data_into_datastore(input):
         log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
         errored = True
     except Exception as e:
+        if isinstance(e, RETRYABLE_ERRORS):
+            tries = job_dict['metadata'].get('tries', 0)
+            if tries == 0:
+                log.info("Job %s failed due to temporary error [%s], retrying", job_id, e)
+                job_dict['status'] = 'pending'
+                job_dict['metadata']['tries'] = tries + 1
+                enqueue_job(
+                    xloader_data_into_datastore,
+                    [input],
+                    rq_kwargs=dict(timeout=RETRIED_JOB_TIMEOUT)
+                )
+                return None
+
         db.mark_job_as_errored(
             job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
         job_dict['status'] = 'error'
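The gate above retries a job at most once: only the transient Postgres lock/deadlock errors in RETRYABLE_ERRORS qualify, and only when the job has never been retried before (tries == 0); anything else still falls through to mark_job_as_errored. A stripped-down sketch of the same control flow, where do_work and enqueue are hypothetical stand-ins rather than xloader APIs:

    from psycopg2 import errors

    RETRYABLE = (errors.DeadlockDetected, errors.LockNotAvailable, errors.ObjectInUse)

    def run_job(job, do_work, enqueue):
        try:
            do_work(job)
        except RETRYABLE:
            if job.setdefault('tries', 0) == 0:
                job['tries'] += 1
                enqueue(job)  # requeue once, then bail out of this run
                return
            raise  # a second transient failure is treated like any other error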
@@ -541,8 +564,7 @@ def __init__(self, task_id, input):
         self.input = input
 
     def emit(self, record):
-        conn = db.ENGINE.connect()
-        try:
+        with db.ENGINE.connect() as conn:
             # Turn strings into unicode to stop SQLAlchemy
             # "Unicode type received non-unicode bind param value" warnings.
             message = str(record.getMessage())
@@ -558,8 +580,6 @@ def emit(self, record):
                 module=module,
                 funcName=funcName,
                 lineno=record.lineno))
-        finally:
-            conn.close()
 
 
 class DatetimeJsonEncoder(json.JSONEncoder):

ckanext/xloader/loader.py (+106 −36)
@@ -78,6 +78,49 @@ def detect_encoding(file_path):
     return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}
 
 
+def _fields_match(fields, existing_fields, logger):
+    ''' Check whether all columns have the same names and types as previously,
+    independent of ordering.
+    '''
+    # drop the generated '_id' field
+    for index in range(len(existing_fields)):
+        if existing_fields[index]['id'] == '_id':
+            existing_fields.pop(index)
+            break
+
+    # fail fast if number of fields doesn't match
+    field_count = len(fields)
+    if field_count != len(existing_fields):
+        logger.info("Fields do not match; there are now %s fields but previously %s", field_count, len(existing_fields))
+        return False
+
+    # ensure each field is present in both collections with the same type
+    for index in range(field_count):
+        field_id = fields[index]['id']
+        for existing_index in range(field_count):
+            existing_field_id = existing_fields[existing_index]['id']
+            if field_id == existing_field_id:
+                if fields[index]['type'] == existing_fields[existing_index]['type']:
+                    break
+                else:
+                    logger.info("Fields do not match; new type for %s field is %s but existing type is %s",
+                                field_id, fields[index]["type"], existing_fields[existing_index]['type'])
+                    return False
+        else:
+            logger.info("Fields do not match; no existing entry found for %s", field_id)
+            return False
+    return True
+
+
+def _clear_datastore_resource(resource_id):
+    ''' Delete all records from the datastore table, without dropping the table itself.
+    '''
+    engine = get_write_engine()
+    with engine.begin() as conn:
+        conn.execute("SET LOCAL lock_timeout = '5s'")
+        conn.execute('TRUNCATE TABLE "{}"'.format(resource_id))
+
+
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
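_fields_match deliberately compares columns by name and type while ignoring ordering and the datastore-generated _id column. The same order-independent check can be phrased with dicts; a compact sketch (not the committed code, and without the mismatch logging the real helper provides):

    def fields_match(fields, existing_fields):
        # Map column name -> type; drop the generated '_id' column.
        new = {f['id']: f['type'] for f in fields}
        old = {f['id']: f['type'] for f in existing_fields if f['id'] != '_id'}
        return new == old

On the companion helper: because _clear_datastore_resource runs TRUNCATE inside engine.begin(), the SET LOCAL lock_timeout applies only to that one transaction. If another session holds the table for more than five seconds, the job fails with LockNotAvailable, which jobs.py now treats as retryable rather than waiting indefinitely.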

@@ -140,34 +183,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = {}
     if existing:
+        existing_fields = existing.get('fields', [])
         existing_info = dict((f['id'], f['info'])
-                             for f in existing.get('fields', [])
+                             for f in existing_fields
                              if 'info' in f)
 
-        '''
-        Delete existing datastore table before proceeding. Otherwise
-        the COPY will append to the existing table. And if
-        the fields have significantly changed, it may also fail.
-        '''
-        logger.info('Deleting "{res_id}" from DataStore.'.format(
-            res_id=resource_id))
-        delete_datastore_resource(resource_id)
-
-    # Columns types are either set (overridden) in the Data Dictionary page
-    # or default to text type (which is robust)
-    fields = [
-        {'id': header_name,
-         'type': existing_info.get(header_name, {})
-         .get('type_override') or 'text',
-         }
-        for header_name in headers]
+        # Column types are either set (overridden) in the Data Dictionary page
+        # or default to text type (which is robust)
+        fields = [
+            {'id': header_name,
+             'type': existing_info.get(header_name, {})
+             .get('type_override') or 'text',
+             }
+            for header_name in headers]
 
-    # Maintain data dictionaries from matching column names
-    if existing_info:
+        # Maintain data dictionaries from matching column names
         for f in fields:
             if f['id'] in existing_info:
                 f['info'] = existing_info[f['id']]
 
+        '''
+        Delete or truncate existing datastore table before proceeding,
+        depending on whether any fields have changed.
+        Otherwise the COPY will append to the existing table.
+        And if the fields have significantly changed, it may also fail.
+        '''
+        if _fields_match(fields, existing_fields, logger):
+            logger.info('Clearing records for "%s" from DataStore.', resource_id)
+            _clear_datastore_resource(resource_id)
+        else:
+            logger.info('Deleting "%s" from DataStore.', resource_id)
+            delete_datastore_resource(resource_id)
+    else:
+        fields = [
+            {'id': header_name,
+             'type': 'text'}
+            for header_name in headers]
+
     logger.info('Fields: %s', fields)
 
     # Create table
@@ -281,6 +333,18 @@ def create_column_indexes(fields, resource_id, logger):
     logger.info('...column indexes created.')
 
 
+def _save_type_overrides(headers_dicts):
+    # copy 'type' to 'type_override' if it's not the default type (text)
+    # and there isn't already an override in place
+    for h in headers_dicts:
+        if h['type'] != 'text':
+            if 'info' in h:
+                if 'type_override' not in h['info']:
+                    h['info']['type_override'] = h['type']
+            else:
+                h['info'] = {'type_override': h['type']}
+
+
 def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads an Excel file (or other tabular data recognized by tabulator)
     into Datastore and creates indexes.
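A quick illustration of _save_type_overrides on hypothetical header dicts: sniffed non-text types gain an override, explicit user overrides are left alone, and default text columns are untouched.

    headers_dicts = [
        {'id': 'when', 'type': 'timestamp'},                 # sniffed: gains an override
        {'id': 'price', 'type': 'numeric',
         'info': {'type_override': 'text'}},                 # user override: left alone
        {'id': 'name', 'type': 'text'},                      # default type: nothing recorded
    ]
    _save_type_overrides(headers_dicts)
    assert headers_dicts[0]['info'] == {'type_override': 'timestamp'}
    assert headers_dicts[1]['info'] == {'type_override': 'text'}
    assert 'info' not in headers_dicts[2]

Persisting sniffed types as overrides is what lets a re-upload of the same file pass _fields_match and take the truncate path, keeping the Data Dictionary intact; this is exactly what the new test_loader.py assertions below check.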
@@ -311,9 +375,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = None
     if existing:
+        existing_fields = existing.get('fields', [])
         existing_info = dict(
             (f['id'], f['info'])
-            for f in existing.get('fields', []) if 'info' in f)
+            for f in existing_fields if 'info' in f)
 
     # Some headers might have been converted from strings to floats and such.
     headers = encode_headers(headers)
@@ -349,16 +414,6 @@ def row_iterator():
             yield data_row
     result = row_iterator()
 
-    '''
-    Delete existing datstore resource before proceeding. Otherwise
-    'datastore_create' will append to the existing datastore. And if
-    the fields have significantly changed, it may also fail.
-    '''
-    if existing:
-        logger.info('Deleting "{res_id}" from datastore.'.format(
-            res_id=resource_id))
-        delete_datastore_resource(resource_id)
-
     headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                      for field in zip(headers, types)]
@@ -372,8 +427,24 @@ def row_iterator():
             if type_override in list(_TYPE_MAPPING.values()):
                 h['type'] = type_override
 
-    logger.info('Determined headers and types: {headers}'.format(
-        headers=headers_dicts))
+    # preserve any types that we have sniffed unless told otherwise
+    _save_type_overrides(headers_dicts)
+
+    logger.info('Determined headers and types: %s', headers_dicts)
+
+    '''
+    Delete or truncate existing datastore table before proceeding,
+    depending on whether any fields have changed.
+    Otherwise 'datastore_create' will append to the existing datastore.
+    And if the fields have significantly changed, it may also fail.
+    '''
+    if existing:
+        if _fields_match(headers_dicts, existing_fields, logger):
+            logger.info('Clearing records for "%s" from DataStore.', resource_id)
+            _clear_datastore_resource(resource_id)
+        else:
+            logger.info('Deleting "%s" from datastore.', resource_id)
+            delete_datastore_resource(resource_id)
 
     logger.info('Copying to database...')
     count = 0
@@ -382,7 +453,7 @@ def row_iterator():
     non_empty_types = ['timestamp', 'numeric']
     for i, records in enumerate(chunky(result, 250)):
         count += len(records)
-        logger.info('Saving chunk {number}'.format(number=i))
+        logger.info('Saving chunk %s', i)
         for row in records:
             for column_index, column_name in enumerate(row):
                 if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '':
@@ -391,8 +462,7 @@ def row_iterator():
     logger.info('...copying done')
 
     if count:
-        logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
-            n=count, res_id=resource_id))
+        logger.info('Successfully pushed %s entries to "%s".', count, resource_id)
     else:
         # no datastore table is created
         raise LoaderError('No entries found - nothing to load')

ckanext/xloader/plugin.py (+3)
@@ -95,6 +95,9 @@ def notify(self, entity, operation):
         if _should_remove_unsupported_resource_from_datastore(resource_dict):
             toolkit.enqueue_job(fn=_remove_unsupported_resource_from_datastore, args=[entity.id])
 
+        if not getattr(entity, 'url_changed', False):
+            return
+
         self._submit_to_xloader(resource_dict)
 
     # IResourceController
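This mirrors the guard used by CKAN core's DataPusher plugin: url_changed is a transient attribute set on the Resource during an update when its URL actually changed, so metadata-only edits no longer re-trigger an xloader submission. The unsupported-format cleanup above still runs either way, since it is enqueued before the early return.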
New template: DataStore confirm-delete page (+22; exact path not shown in this capture)
@@ -0,0 +1,22 @@
+{% extends "page.html" %}
+
+{% block subtitle %}{{ _("Confirm Delete") }}{% endblock %}
+
+{% block maintag %}<div class="row" role="main">{% endblock %}
+
+{% block main_content %}
+  <section class="module col-md-6 col-md-offset-3">
+    <div class="module-content">
+      {% block form %}
+      <p>{{ _('Are you sure you want to delete the DataStore and Data Dictionary?') }}</p>
+      <p class="form-actions">
+        <form action="{{ h.url_for('xloader.delete_datastore_table', id=package_id, resource_id=resource_id) }}" method="post">
+          {{ h.csrf_input() if 'csrf_input' in h }}
+          <button class="btn btn-danger" type="submit" name="cancel">{{ _('Cancel') }}</button>
+          <button class="btn btn-primary" type="submit" name="delete">{{ _('Confirm Delete') }}</button>
+        </form>
+      </p>
+      {% endblock %}
+    </div>
+  </section>
+{% endblock %}

ckanext/xloader/templates/xloader/resource_data.html (+17 −2)
@@ -4,11 +4,26 @@
 {% block primary_content_inner %}
 
-  {% set action = h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) %}
   {% set show_table = true %}
 
+  {% block delete_ds_button %}
+    {% if res.datastore_active %}
+      {% set delete_action = h.url_for('xloader.delete_datastore_table', id=pkg.id, resource_id=res.id) %}
+      <form method="post" action="{{ delete_action }}" class="mb-3 d-inline-block pull-right">
+        {{ h.csrf_input() if 'csrf_input' in h }}
+        <a href="{{ delete_action }}"
+           class="btn btn-danger pull-left"
+           type="submit"
+           data-module="confirm-action"
+           data-module-with-data=true
+           data-module-content="{{ _('Are you sure you want to delete the DataStore and Data Dictionary?') }}"
+        >{% block delete_datastore_button_text %}<i class="fa fa-remove"></i> {{ _('Delete from DataStore') }}{% endblock %}</a>
+      </form>
+    {% endif %}
+  {% endblock %}
+
 {% block upload_ds_button %}
-  <form method="post" action="{{ action }}" class="datapusher-form">
+  <form method="post" action="{{ h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) }}" class="datapusher-form mb-3 d-inline-block">
     {{ h.csrf_input() if 'csrf_input' in h }}
     <button class="btn btn-primary" name="save" type="submit">
       <i class="fa fa-cloud-upload"></i> {{ _('Upload to DataStore') }}

ckanext/xloader/tests/test_loader.py (+8)
@@ -949,6 +949,14 @@ def test_simple(self, Session):
             u"numeric",
             u"text",
         ]
+        # Check that the sniffed types have been recorded as overrides
+        rec = p.toolkit.get_action("datastore_search")(
+            None, {"resource_id": resource_id, "limit": 0}
+        )
+        fields = [f for f in rec["fields"] if not f["id"].startswith("_")]
+        assert fields[0].get("info", {}).get("type_override", "") == "timestamp"
+        assert fields[1].get("info", {}).get("type_override", "") == "numeric"
+        assert fields[2].get("info", {}).get("type_override", "") == ""
 
     # test disabled by default to avoid adding large file to repo and slow test
     @pytest.mark.skip

ckanext/xloader/utils.py (+1)
@@ -113,6 +113,7 @@ def set_resource_metadata(update_dict):
     # better fix
 
     q = model.Session.query(model.Resource). \
+        with_for_update(of=model.Resource). \
         filter(model.Resource.id == update_dict['resource_id'])
     resource = q.one()
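with_for_update() makes the query emit SELECT ... FOR UPDATE, so the resource row stays locked until set_resource_metadata's transaction commits; together with the retryable-error handling in jobs.py, concurrent jobs touching the same resource now serialize (or deadlock and get retried) instead of clobbering each other's metadata. A self-contained sketch of the pattern, using an illustrative mapped class rather than CKAN's model:

    import sqlalchemy as sa
    from sqlalchemy.orm import declarative_base, Session

    Base = declarative_base()

    class Resource(Base):  # illustrative stand-in for CKAN's model.Resource
        __tablename__ = 'resource'
        id = sa.Column(sa.Text, primary_key=True)
        state = sa.Column(sa.Text)

    def set_state(session: Session, resource_id: str, state: str) -> None:
        # with_for_update() emits SELECT ... FOR UPDATE, holding a row lock
        # until this session commits or rolls back.
        resource = (
            session.query(Resource)
            .with_for_update(of=Resource)
            .filter(Resource.id == resource_id)
            .one()
        )
        resource.state = state
        session.commit()  # releases the lock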
