From 2b2d0cfdca4724d065457a3d4c1a14e610cbf474 Mon Sep 17 00:00:00 2001 From: ranafayed Date: Mon, 12 Oct 2020 16:49:41 +0200 Subject: [PATCH] handle 'surt_ordered: False' --- pywb/warcserver/index/aggregator.py | 4 +++- pywb/warcserver/index/query.py | 4 +++- pywb/warcserver/warcserver.py | 12 ++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 032ad1d05..79f447122 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -33,6 +33,7 @@ def __call__(self, params): if content_type: params['filter'] = '=mime:' + content_type + params.update({'surt_ordered': self.surt_ordered}) query = CDXQuery(params) cdx_iter, errs = self.load_index(query.params) @@ -115,10 +116,11 @@ def get_source_list(self, params): #============================================================================= class BaseSourceListAggregator(BaseAggregator): - def __init__(self, sources, **kwargs): + def __init__(self, sources, surt_ordered, **kwargs): self.sources = sources self.sources_key = kwargs.get('sources_key', 'sources') self.invert_sources = kwargs.get('invert_sources', False) + self.surt_ordered = surt_ordered def get_all_sources(self, params): return self.sources diff --git a/pywb/warcserver/index/query.py b/pywb/warcserver/index/query.py index 2d7fb670e..38391204a 100644 --- a/pywb/warcserver/index/query.py +++ b/pywb/warcserver/index/query.py @@ -10,6 +10,8 @@ def __init__(self, params): self.params = params alt_url = self.params.get('alt_url') url = alt_url or self.url + surt_ordered = self.params.get('surt_ordered') + if not self.params.get('matchType'): if url.startswith('*.'): url = self.params['url'] = url[2:] @@ -23,7 +25,7 @@ def __init__(self, params): if alt_url: self.params['alt_url'] = url - start, end = calc_search_range(url=url, + start, end = calc_search_range(url=url, surt_ordered=surt_ordered, match_type=self.params['matchType'], url_canon=self.params.get('_url_canon')) diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index d417c0faf..a5d401b6d 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -176,13 +176,13 @@ def load_coll(self, name, coll_config): archive_paths = coll_config.get('archive_paths') acl_paths = coll_config.get('acl_paths') default_access = coll_config.get('default_access', self.default_access) - + surt_ordered = coll_config.get('surt_ordered', True) else: raise Exception('collection config must be string or dict') # INDEX CONFIG if index: - agg = init_index_agg({name: index}) + agg = init_index_agg({name: index}, surt_ordered=surt_ordered) else: if not isinstance(coll_config, dict): raise Exception('collection config missing') @@ -196,7 +196,7 @@ def load_coll(self, name, coll_config): raise Exception('no index, index_group or sequence found') timeout = int(coll_config.get('timeout', 0)) - agg = init_index_agg(index_group, True, timeout) + agg = init_index_agg(index_group, True, timeout, surt_ordered=surt_ordered) # ARCHIVE CONFIG if not archive_paths: @@ -257,14 +257,14 @@ def register_source(source_cls, end=False): # ============================================================================ -def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None): +def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None, surt_ordered=True): sources = {} for n, v in iteritems(source_configs): sources[n] = init_index_source(v, source_list=source_list) if use_gevent: - return GeventTimeoutAggregator(sources, timeout=timeout) + return GeventTimeoutAggregator(sources, timeout=timeout, surt_ordered=surt_ordered) else: - return SimpleAggregator(sources) + return SimpleAggregator(sources, surt_ordered=surt_ordered)