JVickery-TBS
diff --git a/‎.flake8
-4 b/‎.flake8
-4
diff --git a/‎.github/dependabot.yml
+18 b/‎.github/dependabot.yml
+18
diff --git a/‎ckanext/xloader/action.py
+6-1 b/‎ckanext/xloader/action.py
+6-1
diff --git a/‎ckanext/xloader/command.py
+1-2 b/‎ckanext/xloader/command.py
+1-2
diff --git a/‎ckanext/xloader/config_declaration.yaml
+9-3 b/‎ckanext/xloader/config_declaration.yaml
+9-3
diff --git a/‎ckanext/xloader/helpers.py
+19 b/‎ckanext/xloader/helpers.py
+19
diff --git a/‎ckanext/xloader/jobs.py
+14-11 b/‎ckanext/xloader/jobs.py
+14-11
@@ -17,8 +17,4 @@ max-line-length=127
 
 # List ignore rules one per line.
 ignore =
-    E501
-    C901
     W503
-    F401
-    F403
@@ -0,0 +1,18 @@
+version: 2  # Dependabot configuration file format version
+registries:  # named package registries that entries under `updates` can refer to
+  python-index-pypi-org:
+    type: python-index
+    url: https://pypi.org/
+    replaces-base: true  # NOTE(review): presumably makes this URL the base index rather than an extra one -- confirm
+    username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}"  # secret reference; credentials are not stored in this file
+    password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}"
+
+updates:
+- package-ecosystem: pip  # the single update rule in this file, for pip dependencies
+  directory: "/"  # NOTE(review): presumably the repository root -- confirm
+  schedule:
+    interval: daily
+    time: "19:00"  # NOTE(review): no timezone is set here; presumably UTC -- confirm
+  open-pull-requests-limit: 10
+  registries:
+  - python-index-pypi-org  # refers to the registry declared under `registries` above
@@ -152,7 +152,12 @@ def xloader_submit(context, data_dict):
             'original_url': resource_dict.get('url'),
         }
     }
-    timeout = config.get('ckanext.xloader.job_timeout', '3600')
+    # Expand timeout for resources that have to be type-guessed
+    timeout = config.get(
+        'ckanext.xloader.job_timeout',
+        '3600' if utils.datastore_resource_exists(res_id) else '10800')
+    log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)
+
     try:
         job = enqueue_job(
             jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)
 
@@ -3,6 +3,7 @@
 import sys
 import logging
 import ckan.plugins.toolkit as tk
+from ckanext.xloader.utils import XLoaderFormats
 
 
 class XloaderCmd:
@@ -84,8 +85,6 @@ def _submit_resource(self, resource, user, indent=0):
         '''resource: resource dictionary
         '''
         indentation = ' ' * indent
-        # import here, so that that loggers are setup
-        from ckanext.xloader.plugin import XLoaderFormats
 
         if not XLoaderFormats.is_it_an_xloader_format(resource['format']):
             print(indentation
 
@@ -29,9 +29,7 @@ groups:
         default: 1_000_000_000
         example: 100000
         description: |
-            The connection string for the jobs database used by XLoader. The
-            default of an sqlite file is fine for development. For production use a
-            Postgresql database.
+            The maximum file size that XLoader will attempt to load.
         type: int
         required: false
       - key: ckanext.xloader.use_type_guessing
@@ -55,6 +53,14 @@ groups:
             Use with ckanext.xloader.use_type_guessing to set strict true or false
             for type guessing. If set to False, the types will always fallback to string type.
         type: bool
+      - key: ckanext.xloader.max_type_guessing_length
+        default: 0
+        example: 100000
+        description: |
+            The maximum file size that will be passed to Tabulator if the
+            use_type_guessing flag is enabled. Larger files will use COPY even if
+            the flag is set. Defaults to 1/10 of the maximum content length.
+        type: int
         required: false
       - key: ckanext.xloader.parse_dates_dayfirst
         default: False
 
@@ -1,4 +1,5 @@
 import ckan.plugins.toolkit as toolkit
+from ckanext.xloader.utils import XLoaderFormats
 
 
 def xloader_status(resource_id):
@@ -25,3 +26,21 @@ def xloader_status_description(status):
         return captions.get(status['status'], status['status'].capitalize())
     else:
         return _('Not Uploaded Yet')
+
+
+def is_resource_supported_by_xloader(res_dict, check_access=True):  # combines format, access and url_type checks
+    is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format'))
+    is_datastore_active = res_dict.get('datastore_active', False)  # a missing key counts as not active
+    if check_access:  # ask the check_access helper about 'package_update' using the resource's package_id
+        user_has_access = toolkit.h.check_access('package_update', {'id': res_dict.get('package_id')})
+    else:
+        user_has_access = True  # caller opted out of the permission check
+    url_type = res_dict.get('url_type')
+    if url_type:  # an empty or missing url_type is treated as supported (see the else branch)
+        try:  # supported only if url_type is NOT among those returned by datastore_rw_resource_url_types()
+            is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types()
+        except AttributeError:  # presumably raised when that helper is unavailable -- TODO confirm
+            is_supported_url_type = (url_type == 'upload')  # fallback: only 'upload' is accepted
+    else:
+        is_supported_url_type = True
+    return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type
@@ -7,6 +7,7 @@
 import tempfile
 import json
 import datetime
+import os
 import traceback
 import sys
 
@@ -16,23 +17,26 @@
 import sqlalchemy as sa
 
 from ckan import model
-from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version
+from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
 
-from . import loader
-from . import db
+from . import db, loader
 from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
-from .utils import set_resource_metadata
+from .utils import datastore_resource_exists, set_resource_metadata
 
 try:
     from ckan.lib.api_token import get_user_from_token
 except ImportError:
     get_user_from_token = None
 
+log = logging.getLogger(__name__)
+
 SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True))
 if not SSL_VERIFY:
     requests.packages.urllib3.disable_warnings()
 
 MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
+# Don't try Tabulator load on large files
+MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10)
 MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
 CHUNK_SIZE = 16 * 1024  # 16kb
 DOWNLOAD_TIMEOUT = 30
@@ -80,15 +84,13 @@ def xloader_data_into_datastore(input):
         db.mark_job_as_errored(job_id, str(e))
         job_dict['status'] = 'error'
         job_dict['error'] = str(e)
-        log = logging.getLogger(__name__)
         log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
         errored = True
     except Exception as e:
         db.mark_job_as_errored(
             job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
         job_dict['status'] = 'error'
         job_dict['error'] = str(e)
-        log = logging.getLogger(__name__)
         log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
         errored = True
     finally:
@@ -206,11 +208,12 @@ def tabulator_load():
     logger.info('Loading CSV')
     # If ckanext.xloader.use_type_guessing is not configured, fall back to
     # deprecated ckanext.xloader.just_load_with_messytables
-    use_type_guessing = asbool(config.get(
-        'ckanext.xloader.use_type_guessing', config.get(
-            'ckanext.xloader.just_load_with_messytables', False)))
-    logger.info("'use_type_guessing' mode is: %s",
-                use_type_guessing)
+    use_type_guessing = asbool(
+        config.get('ckanext.xloader.use_type_guessing', config.get(
+            'ckanext.xloader.just_load_with_messytables', False))) \
+        and not datastore_resource_exists(resource['id']) \
+        and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
+    logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
     try:
         if use_type_guessing:
             tabulator_load()
Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,12 @@ def xloader_submit(context, data_dict):`
`152`	`152`	`'original_url': resource_dict.get('url'),`
`153`	`153`	`}`
`154`	`154`	`}`
`155`		`- timeout = config.get('ckanext.xloader.job_timeout', '3600')`
	`155`	`+ # Expand timeout for resources that have to be type-guessed`
	`156`	`+ timeout = config.get(`
	`157`	`+ 'ckanext.xloader.job_timeout',`
	`158`	`+ '3600' if utils.datastore_resource_exists(res_id) else '10800')`
	`159`	`+ log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)`
	`160`	`+`
`156`	`161`	`try:`
`157`	`162`	`job = enqueue_job(`
`158`	`163`	`jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)`