Skip to content

Commit b483cce

Browse files
committed
Merge branch 'master-queensland' into fix/qld-gov-au/stringish-type-guessing
# Conflicts: # ckanext/xloader/config_declaration.yaml ### RESOLVED.
2 parents 080ea1f + 3a47541 commit b483cce

29 files changed

+1072
-291
lines changed

.flake8

-4
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,4 @@ max-line-length=127
1717

1818
# List ignore rules one per line.
1919
ignore =
20-
E501
21-
C901
2220
W503
23-
F401
24-
F403

.github/dependabot.yml

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
version: 2
2+
registries:
3+
python-index-pypi-org:
4+
type: python-index
5+
url: https://pypi.org/
6+
replaces-base: true
7+
username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}"
8+
password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}"
9+
10+
updates:
11+
- package-ecosystem: pip
12+
directory: "/"
13+
schedule:
14+
interval: daily
15+
time: "19:00"
16+
open-pull-requests-limit: 10
17+
registries:
18+
- python-index-pypi-org

ckanext/xloader/action.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,12 @@ def xloader_submit(context, data_dict):
152152
'original_url': resource_dict.get('url'),
153153
}
154154
}
155-
timeout = config.get('ckanext.xloader.job_timeout', '3600')
155+
# Expand timeout for resources that have to be type-guessed
156+
timeout = config.get(
157+
'ckanext.xloader.job_timeout',
158+
'3600' if utils.datastore_resource_exists(res_id) else '10800')
159+
log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)
160+
156161
try:
157162
job = enqueue_job(
158163
jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)

ckanext/xloader/command.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import logging
55
import ckan.plugins.toolkit as tk
6+
from ckanext.xloader.utils import XLoaderFormats
67

78

89
class XloaderCmd:
@@ -84,8 +85,6 @@ def _submit_resource(self, resource, user, indent=0):
8485
'''resource: resource dictionary
8586
'''
8687
indentation = ' ' * indent
87-
# import here, so that that loggers are setup
88-
from ckanext.xloader.plugin import XLoaderFormats
8988

9089
if not XLoaderFormats.is_it_an_xloader_format(resource['format']):
9190
print(indentation

ckanext/xloader/config_declaration.yaml

+9-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,7 @@ groups:
2929
default: 1_000_000_000
3030
example: 100000
3131
description: |
32-
The connection string for the jobs database used by XLoader. The
33-
default of an sqlite file is fine for development. For production use a
34-
Postgresql database.
32+
The maximum file size that XLoader will attempt to load.
3533
type: int
3634
required: false
3735
- key: ckanext.xloader.use_type_guessing
@@ -55,6 +53,14 @@ groups:
5553
Use with ckanext.xloader.use_type_guessing to set strict true or false
5654
for type guessing. If set to False, the types will always fall back to string type.
5755
type: bool
56+
- key: ckanext.xloader.max_type_guessing_length
57+
default: 0
58+
example: 100000
59+
description: |
60+
The maximum file size (in bytes) that will be passed to Tabulator if the
61+
use_type_guessing flag is enabled. Larger files will use COPY even if
62+
the flag is set. If unset or 0, defaults to 1/10 of the maximum content length.
63+
type: int
5864
required: false
5965
- key: ckanext.xloader.parse_dates_dayfirst
6066
default: False

ckanext/xloader/helpers.py

+19
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import ckan.plugins.toolkit as toolkit
2+
from ckanext.xloader.utils import XLoaderFormats
23

34

45
def xloader_status(resource_id):
@@ -25,3 +26,21 @@ def xloader_status_description(status):
2526
return captions.get(status['status'], status['status'].capitalize())
2627
else:
2728
return _('Not Uploaded Yet')
29+
30+
31+
def is_resource_supported_by_xloader(res_dict, check_access=True):
32+
is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format'))
33+
is_datastore_active = res_dict.get('datastore_active', False)
34+
if check_access:
35+
user_has_access = toolkit.h.check_access('package_update', {'id': res_dict.get('package_id')})
36+
else:
37+
user_has_access = True
38+
url_type = res_dict.get('url_type')
39+
if url_type:
40+
try:
41+
is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types()
42+
except AttributeError:
43+
is_supported_url_type = (url_type == 'upload')
44+
else:
45+
is_supported_url_type = True
46+
return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type

ckanext/xloader/jobs.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import tempfile
88
import json
99
import datetime
10+
import os
1011
import traceback
1112
import sys
1213

@@ -16,23 +17,26 @@
1617
import sqlalchemy as sa
1718

1819
from ckan import model
19-
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version
20+
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
2021

21-
from . import loader
22-
from . import db
22+
from . import db, loader
2323
from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
24-
from .utils import set_resource_metadata
24+
from .utils import datastore_resource_exists, set_resource_metadata
2525

2626
try:
2727
from ckan.lib.api_token import get_user_from_token
2828
except ImportError:
2929
get_user_from_token = None
3030

31+
log = logging.getLogger(__name__)
32+
3133
SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True))
3234
if not SSL_VERIFY:
3335
requests.packages.urllib3.disable_warnings()
3436

3537
MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
38+
# Don't try Tabulator load on large files
39+
MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10)
3640
MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
3741
CHUNK_SIZE = 16 * 1024 # 16kb
3842
DOWNLOAD_TIMEOUT = 30
@@ -80,15 +84,13 @@ def xloader_data_into_datastore(input):
8084
db.mark_job_as_errored(job_id, str(e))
8185
job_dict['status'] = 'error'
8286
job_dict['error'] = str(e)
83-
log = logging.getLogger(__name__)
8487
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
8588
errored = True
8689
except Exception as e:
8790
db.mark_job_as_errored(
8891
job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
8992
job_dict['status'] = 'error'
9093
job_dict['error'] = str(e)
91-
log = logging.getLogger(__name__)
9294
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
9395
errored = True
9496
finally:
@@ -206,11 +208,12 @@ def tabulator_load():
206208
logger.info('Loading CSV')
207209
# If ckanext.xloader.use_type_guessing is not configured, fall back to
208210
# deprecated ckanext.xloader.just_load_with_messytables
209-
use_type_guessing = asbool(config.get(
210-
'ckanext.xloader.use_type_guessing', config.get(
211-
'ckanext.xloader.just_load_with_messytables', False)))
212-
logger.info("'use_type_guessing' mode is: %s",
213-
use_type_guessing)
211+
use_type_guessing = asbool(
212+
config.get('ckanext.xloader.use_type_guessing', config.get(
213+
'ckanext.xloader.just_load_with_messytables', False))) \
214+
and not datastore_resource_exists(resource['id']) \
215+
and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
216+
logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
214217
try:
215218
if use_type_guessing:
216219
tabulator_load()

0 commit comments

Comments
 (0)