|
7 | 7 | import tempfile
|
8 | 8 | import json
|
9 | 9 | import datetime
|
| 10 | +import os |
10 | 11 | import traceback
|
11 | 12 | import sys
|
12 | 13 |
|
|
16 | 17 | import sqlalchemy as sa
|
17 | 18 |
|
18 | 19 | from ckan import model
|
19 |
| -from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version |
| 20 | +from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config |
20 | 21 |
|
21 |
| -from . import loader |
22 |
| -from . import db |
| 22 | +from . import db, loader |
23 | 23 | from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
|
24 |
| -from .utils import set_resource_metadata |
| 24 | +from .utils import datastore_resource_exists, set_resource_metadata |
25 | 25 |
|
26 | 26 | try:
|
27 | 27 | from ckan.lib.api_token import get_user_from_token
|
28 | 28 | except ImportError:
|
29 | 29 | get_user_from_token = None
|
30 | 30 |
|
| 31 | +log = logging.getLogger(__name__) |
| 32 | + |
31 | 33 | SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True))
|
32 | 34 | if not SSL_VERIFY:
|
33 | 35 | requests.packages.urllib3.disable_warnings()
|
34 | 36 |
|
35 | 37 | MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
|
| 38 | +# Don't try Tabulator load on large files |
| 39 | +MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10) |
36 | 40 | MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
|
37 | 41 | CHUNK_SIZE = 16 * 1024 # 16kb
|
38 | 42 | DOWNLOAD_TIMEOUT = 30
|
@@ -80,15 +84,13 @@ def xloader_data_into_datastore(input):
|
80 | 84 | db.mark_job_as_errored(job_id, str(e))
|
81 | 85 | job_dict['status'] = 'error'
|
82 | 86 | job_dict['error'] = str(e)
|
83 |
| - log = logging.getLogger(__name__) |
84 | 87 | log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
|
85 | 88 | errored = True
|
86 | 89 | except Exception as e:
|
87 | 90 | db.mark_job_as_errored(
|
88 | 91 | job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
|
89 | 92 | job_dict['status'] = 'error'
|
90 | 93 | job_dict['error'] = str(e)
|
91 |
| - log = logging.getLogger(__name__) |
92 | 94 | log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
|
93 | 95 | errored = True
|
94 | 96 | finally:
|
@@ -206,11 +208,12 @@ def tabulator_load():
|
206 | 208 | logger.info('Loading CSV')
|
207 | 209 | # If ckanext.xloader.use_type_guessing is not configured, fall back to
|
208 | 210 | # deprecated ckanext.xloader.just_load_with_messytables
|
209 |
| - use_type_guessing = asbool(config.get( |
210 |
| - 'ckanext.xloader.use_type_guessing', config.get( |
211 |
| - 'ckanext.xloader.just_load_with_messytables', False))) |
212 |
| - logger.info("'use_type_guessing' mode is: %s", |
213 |
| - use_type_guessing) |
| 211 | + use_type_guessing = asbool( |
| 212 | + config.get('ckanext.xloader.use_type_guessing', config.get( |
| 213 | + 'ckanext.xloader.just_load_with_messytables', False))) \ |
| 214 | + and not datastore_resource_exists(resource['id']) \ |
| 215 | + and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH |
| 216 | + logger.info("'use_type_guessing' mode is: %s", use_type_guessing) |
214 | 217 | try:
|
215 | 218 | if use_type_guessing:
|
216 | 219 | tabulator_load()
|
|
0 commit comments