Skip to content

Commit 5c07ba4

Browse files
committed
Merge branch 'master' into feature/validation-support
# Conflicts: # ckanext/xloader/plugin.py # ckanext/xloader/utils.py ### RESOLVED.
2 parents e888153 + 2b26209 commit 5c07ba4

28 files changed

+865
-293
lines changed

.github/dependabot.yml

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
version: 2
2+
registries:
3+
python-index-pypi-org:
4+
type: python-index
5+
url: https://pypi.org/
6+
replaces-base: true
7+
username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}"
8+
password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}"
9+
10+
updates:
11+
- package-ecosystem: pip
12+
directory: "/"
13+
schedule:
14+
interval: daily
15+
time: "19:00"
16+
open-pull-requests-limit: 10
17+
registries:
18+
- python-index-pypi-org

README.rst

+6
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,12 @@ This setting is shared with other plugins that download resource files, such as
196196

197197
ckan.download_proxy = http://my-proxy:1234/
198198

199+
You may also wish to configure the database to use your preferred date input style on COPY.
200+
For example, to make `PostgreSQL <https://www.postgresql.org/docs/current/runtime-config-client.html#RUNTIME-CONFIG-CLIENT-FORMAT>`_
201+
expect European (day-first) dates, you could add to ``postgresql.conf``::
202+
203+
datestyle=ISO,DMY
204+
199205
------------------------
200206
Developer installation
201207
------------------------

ckanext/xloader/action.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,17 @@ def xloader_submit(context, data_dict):
152152
'original_url': resource_dict.get('url'),
153153
}
154154
}
155-
timeout = config.get('ckanext.xloader.job_timeout', '3600')
155+
# Expand timeout for resources that have to be type-guessed
156+
timeout = config.get(
157+
'ckanext.xloader.job_timeout',
158+
'3600' if utils.datastore_resource_exists(res_id) else '10800')
159+
log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)
160+
156161
try:
157162
job = enqueue_job(
158-
jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)
163+
jobs.xloader_data_into_datastore, [data],
164+
title="xloader_submit: package: {} resource: {}".format(resource_dict.get('package_id'), res_id),
165+
rq_kwargs=dict(timeout=timeout)
159166
)
160167
except Exception:
161168
log.exception('Unable to enqueued xloader res_id=%s', res_id)

ckanext/xloader/command.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
import logging
55
import ckan.plugins.toolkit as tk
6+
from ckanext.xloader.utils import XLoaderFormats
67

78

89
class XloaderCmd:
@@ -84,8 +85,6 @@ def _submit_resource(self, resource, user, indent=0):
8485
'''resource: resource dictionary
8586
'''
8687
indentation = ' ' * indent
87-
# import here, so that that loggers are setup
88-
from ckanext.xloader.plugin import XLoaderFormats
8988

9089
if not XLoaderFormats.is_it_an_xloader_format(resource['format']):
9190
print(indentation

ckanext/xloader/config_declaration.yaml

+10-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,7 @@ groups:
2929
default: 1_000_000_000
3030
example: 100000
3131
description: |
32-
The connection string for the jobs database used by XLoader. The
33-
default of an sqlite file is fine for development. For production use a
34-
Postgresql database.
32+
The maximum file size that XLoader will attempt to load.
3533
type: int
3634
required: false
3735
- key: ckanext.xloader.use_type_guessing
@@ -48,6 +46,15 @@ groups:
4846
type: bool
4947
required: false
5048
legacy_key: ckanext.xloader.just_load_with_messytables
49+
- key: ckanext.xloader.max_type_guessing_length
50+
default: 0
51+
example: 100000
52+
description: |
53+
The maximum file size that will be passed to Tabulator if the
54+
use_type_guessing flag is enabled. Larger files will use COPY even if
55+
the flag is set. If unset or 0, defaults to 1/10 of the maximum content length.
56+
type: int
57+
required: false
5158
- key: ckanext.xloader.parse_dates_dayfirst
5259
default: False
5360
example: False

ckanext/xloader/db.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,7 @@ def add_pending_job(job_id, job_type, api_key,
191191
if not metadata:
192192
metadata = {}
193193

194-
conn = ENGINE.connect()
195-
trans = conn.begin()
196-
try:
194+
with ENGINE.begin() as conn:
197195
conn.execute(JOBS_TABLE.insert().values(
198196
job_id=job_id,
199197
job_type=job_type,
@@ -225,12 +223,6 @@ def add_pending_job(job_id, job_type, api_key,
225223
)
226224
if inserts:
227225
conn.execute(METADATA_TABLE.insert(), inserts)
228-
trans.commit()
229-
except Exception:
230-
trans.rollback()
231-
raise
232-
finally:
233-
conn.close()
234226

235227

236228
class InvalidErrorObjectError(Exception):

ckanext/xloader/helpers.py

+17
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import ckan.plugins.toolkit as toolkit
2+
from ckanext.xloader.utils import XLoaderFormats
23

34

45
def xloader_status(resource_id):
@@ -25,3 +26,19 @@ def xloader_status_description(status):
2526
return captions.get(status['status'], status['status'].capitalize())
2627
else:
2728
return _('Not Uploaded Yet')
29+
30+
31+
def is_resource_supported_by_xloader(res_dict, check_access=True):
32+
is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format'))
33+
is_datastore_active = res_dict.get('datastore_active', False)
34+
user_has_access = not check_access or toolkit.h.check_access('package_update',
35+
{'id':res_dict.get('package_id')})
36+
url_type = res_dict.get('url_type')
37+
if url_type:
38+
try:
39+
is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types()
40+
except AttributeError:
41+
is_supported_url_type = (url_type == 'upload')
42+
else:
43+
is_supported_url_type = True
44+
return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type

ckanext/xloader/jobs.py

+42-17
Original file line numberDiff line numberDiff line change
@@ -7,36 +7,49 @@
77
import tempfile
88
import json
99
import datetime
10+
import os
1011
import traceback
1112
import sys
1213

14+
from psycopg2 import errors
1315
from six.moves.urllib.parse import urlsplit
1416
import requests
1517
from rq import get_current_job
1618
import sqlalchemy as sa
1719

1820
from ckan import model
19-
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
21+
from ckan.plugins.toolkit import get_action, asbool, enqueue_job, ObjectNotFound, config
2022

21-
from . import loader
22-
from . import db
23+
from . import db, loader
2324
from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
24-
from .utils import set_resource_metadata
25+
from .utils import datastore_resource_exists, set_resource_metadata
2526

2627
try:
2728
from ckan.lib.api_token import get_user_from_token
2829
except ImportError:
2930
get_user_from_token = None
3031

32+
log = logging.getLogger(__name__)
33+
3134
SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True))
3235
if not SSL_VERIFY:
3336
requests.packages.urllib3.disable_warnings()
3437

3538
MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
39+
# Don't try Tabulator load on large files
40+
MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10)
3641
MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
3742
CHUNK_SIZE = 16 * 1024 # 16kb
3843
DOWNLOAD_TIMEOUT = 30
3944

45+
MAX_RETRIES = 1
46+
RETRYABLE_ERRORS = (
47+
errors.DeadlockDetected,
48+
errors.LockNotAvailable,
49+
errors.ObjectInUse,
50+
)
51+
RETRIED_JOB_TIMEOUT = config.get('ckanext.xloader.job_timeout', '3600')
52+
4053

4154
# input = {
4255
# 'api_key': user['apikey'],
@@ -80,16 +93,30 @@ def xloader_data_into_datastore(input):
8093
db.mark_job_as_errored(job_id, str(e))
8194
job_dict['status'] = 'error'
8295
job_dict['error'] = str(e)
83-
log = logging.getLogger(__name__)
84-
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
96+
log.error('xloader error: %s, %s', e, traceback.format_exc())
8597
errored = True
8698
except Exception as e:
99+
if isinstance(e, RETRYABLE_ERRORS):
100+
tries = job_dict['metadata'].get('tries', 0)
101+
if tries < MAX_RETRIES:
102+
tries = tries + 1
103+
log.info("Job %s failed due to temporary error [%s], retrying", job_id, e)
104+
job_dict['status'] = 'pending'
105+
job_dict['metadata']['tries'] = tries
106+
enqueue_job(
107+
xloader_data_into_datastore,
108+
[input],
109+
title="retry xloader_data_into_datastore: resource: {} attempt {}".format(
110+
job_dict['metadata']['resource_id'], tries),
111+
rq_kwargs=dict(timeout=RETRIED_JOB_TIMEOUT)
112+
)
113+
return None
114+
87115
db.mark_job_as_errored(
88116
job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
89117
job_dict['status'] = 'error'
90118
job_dict['error'] = str(e)
91-
log = logging.getLogger(__name__)
92-
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
119+
log.error('xloader error: %s, %s', e, traceback.format_exc())
93120
errored = True
94121
finally:
95122
# job_dict is defined in xloader_hook's docstring
@@ -206,11 +233,12 @@ def tabulator_load():
206233
logger.info('Loading CSV')
207234
# If ckanext.xloader.use_type_guessing is not configured, fall back to
208235
# deprecated ckanext.xloader.just_load_with_messytables
209-
use_type_guessing = asbool(config.get(
210-
'ckanext.xloader.use_type_guessing', config.get(
211-
'ckanext.xloader.just_load_with_messytables', False)))
212-
logger.info("'use_type_guessing' mode is: %s",
213-
use_type_guessing)
236+
use_type_guessing = asbool(
237+
config.get('ckanext.xloader.use_type_guessing', config.get(
238+
'ckanext.xloader.just_load_with_messytables', False))) \
239+
and not datastore_resource_exists(resource['id']) \
240+
and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
241+
logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
214242
try:
215243
if use_type_guessing:
216244
tabulator_load()
@@ -538,8 +566,7 @@ def __init__(self, task_id, input):
538566
self.input = input
539567

540568
def emit(self, record):
541-
conn = db.ENGINE.connect()
542-
try:
569+
with db.ENGINE.connect() as conn:
543570
# Turn strings into unicode to stop SQLAlchemy
544571
# "Unicode type received non-unicode bind param value" warnings.
545572
message = str(record.getMessage())
@@ -555,8 +582,6 @@ def emit(self, record):
555582
module=module,
556583
funcName=funcName,
557584
lineno=record.lineno))
558-
finally:
559-
conn.close()
560585

561586

562587
class DatetimeJsonEncoder(json.JSONEncoder):

0 commit comments

Comments
 (0)