
Commit

initial commit
alexwaeseperlman committed May 9, 2021
1 parent bded25b commit da6f442
Showing 3 changed files with 195 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
embeddings/*
data/*
43 changes: 43 additions & 0 deletions README.md
@@ -0,0 +1,43 @@
# Exhaustive ASReview model search
This repository lets a user test every combination of selected ASReview model types and strategies (classifier, query, balance, and feature extraction) on a single labelled dataset to see which performs best.
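
Under the hood, the script simply enumerates the Cartesian product of the chosen options and runs one simulation per combination. A minimal sketch of that enumeration, with made-up option lists for illustration (the real script builds them from its command-line arguments):

```
import itertools

# Made-up example option lists; test-models.py reads the real ones from its CLI arguments.
classifiers = ["nb", "svm"]
queries = ["max"]
balances = ["simple"]
features = ["tfidf", "doc2vec"]
priors = ["1,1", "5,5"]

# One simulation is run per element of this product (2 * 1 * 1 * 2 * 2 = 8 combinations here).
for combo in itertools.product(classifiers, queries, balances, features, priors):
    print(combo)
```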

## Usage
Start by running `pip install asreview[all]` to install the required packages. Then run `python test-models.py -h` to see how to run a search. It should look something like this:
```
usage: test-models.py [-h] [-o OUTPUT] [-c CLASSIFIERS [CLASSIFIERS ...]] [-q QUERY [QUERY ...]] [-b BALANCE [BALANCE ...]]
[-f FEATURE_EXTRACTION [FEATURE_EXTRACTION ...]] [-p PRIOR [PRIOR ...]] [-n N_INSTANCES] [-P PRESET]
[-s SEED]
filename
Test many different asreview models out on one dataset to see which perform the best
positional arguments:
filename Path to a labelled csv of abstracts. It should have four columns labelled: "Title", "Abstract",
"Authors", "Included".
optional arguments:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
Path to a directory to output the results into. It is created if necessary
-c CLASSIFIERS [CLASSIFIERS ...], --classifiers CLASSIFIERS [CLASSIFIERS ...]
List of classifiers that will be tested. The accepted options are: logistic, lstm-base, lstm-pool,
nb, nn-2-layer, rf, svm
-q QUERY [QUERY ...], --query QUERY [QUERY ...]
List of query strategies that will be tested. The accepted options are: cluster, max, random,
uncertainty
-b BALANCE [BALANCE ...], --balance BALANCE [BALANCE ...]
List of balancing strategies that will be tested. The accepted options are: double, simple, triple,
undersample
-f FEATURE_EXTRACTION [FEATURE_EXTRACTION ...], --feature_extraction FEATURE_EXTRACTION [FEATURE_EXTRACTION ...]
List of feature extraction models that will be tested. The accepted options are: doc2vec, embedding-
idf, embedding-lstm, sbert, tfidf
-p PRIOR [PRIOR ...], --prior PRIOR [PRIOR ...]
List of the number of prelabelled papers to include formatted like: prior_included,prior_excluded.
For example the input could look like --prior 1,1 5,5 5,10
-n N_INSTANCES, --n_instances N_INSTANCES
The number of iterations per test
-P PRESET, --preset PRESET
The name of the preset test to use. Valid options are: default
-s SEED, --seed SEED The random seed for reproducibility.
```
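
For example, a hypothetical run limited to a couple of models could look like the following (the dataset path `data/reviews.csv` is a placeholder, not a file shipped with this repository):

```
python test-models.py data/reviews.csv -o results -c nb svm -q max -f tfidf -p 1,1 5,5 -s 42
```

One state file per tested combination is written to the output directory.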

150 changes: 150 additions & 0 deletions test-models.py
@@ -0,0 +1,150 @@
import argparse
import itertools
import os
import sys
import time
import zipfile  # currently unused; presumably for the planned embedding download (see TODO below)
from urllib.request import urlretrieve  # currently unused; presumably for the planned embedding download (see TODO below)

import asreview
import asreview.review
import asreview.analysis  # currently unused

presets = {
    'default': {
        'classifiers': ['logistic', 'nb', 'nn-2-layer', 'rf', 'svm'],
        'query': ['max', 'cluster'],
        'balance': ['simple'],
        'feature_extraction': ['tfidf', 'sbert', 'doc2vec'],
        'prior': ['1,1', '5,5', '10,10'],
        'n_instances': 1
    }
}

parser = argparse.ArgumentParser(description='Test many different asreview models out on one dataset to see which perform the best')
parser.add_argument('filename', type=str, help='Path to a labelled csv of abstracts. It should have four columns labelled: "Title", "Abstract", "Authors", "Included".')

parser.add_argument('-o', '--output',
                    help='Path to a directory to output the results into. It is created if necessary',
                    action='store',
                    default='.',
                    type=str)

# Note: these list options use the default 'store' action rather than 'extend';
# with 'extend', user-supplied values would be appended to the default list
# instead of replacing it.
parser.add_argument('-c', '--classifiers',
                    help='List of classifiers that will be tested. The accepted options are: ' + ', '.join(asreview.models.list_classifiers()),
                    default=asreview.models.list_classifiers(),
                    nargs='+',
                    type=str)

parser.add_argument('-q', '--query',
                    help='List of query strategies that will be tested. The accepted options are: ' + ', '.join(asreview.models.query.list_query_strategies()),
                    default=asreview.models.query.list_query_strategies(),
                    nargs='+',
                    type=str)

parser.add_argument('-b', '--balance',
                    help='List of balancing strategies that will be tested. The accepted options are: ' + ', '.join(asreview.models.balance.list_balance_strategies()),
                    default=asreview.models.balance.list_balance_strategies(),
                    nargs='+',
                    type=str)

parser.add_argument('-f', '--feature_extraction',
                    help='List of feature extraction models that will be tested. The accepted options are: ' + ', '.join(asreview.models.feature_extraction.list_feature_extraction()),
                    default=asreview.models.feature_extraction.list_feature_extraction(),
                    nargs='+',
                    type=str)

parser.add_argument('-p', '--prior',
                    help='List of the number of prelabelled papers to include formatted like: prior_included,prior_excluded. For example the input could look like --prior 1,1 5,5 5,10',
                    default=['1,1', '5,5', '5,10'],
                    nargs='+',
                    type=str)

parser.add_argument('-n', '--n_instances',
                    help='The number of iterations per test',
                    action='store',
                    default=1,
                    type=int)

parser.add_argument('-P', '--preset',
                    help='The name of the preset test to use. Valid options are: ' + ', '.join(presets.keys()),
                    action='store',
                    default='none',
                    type=str)

parser.add_argument('-s', '--seed',
                    help='The random seed for reproducibility.',
                    action='store',
                    default=0,
                    type=int)

args = parser.parse_args()
print(args)

# A named preset overrides the individually supplied options for the keys it defines
if args.preset in presets:
    for k, v in presets[args.preset].items():
        args.__dict__[k] = v

# TODO: Download embedding files and enable lstm models


if not os.path.exists(args.filename):
    raise FileNotFoundError(f"Data file '{args.filename}' not found.")

print()
print(f'Model testing will take {len(args.classifiers) * len(args.query) * len(args.balance) * len(args.feature_extraction) * len(args.prior) * args.n_instances} iterations.\n'
      + 'Each iteration can take up to an hour, and the output files can be up to a few GB, depending on the type of model and dataset used.\n'
      + 'Are you sure you would like to continue? [Y/n]', end=' ')

resp = input()

if resp.lower().startswith('n'):
    print('Exiting.')
    sys.exit(0)

os.makedirs(args.output, exist_ok=True)

print('Running models')

dataset = asreview.ASReviewData.from_file(args.filename)

# Try to run a simulation with every combination of the supplied options
for classifier_name, query_name, balance_name, feature_extraction_name, prior in itertools.product(args.classifiers, args.query, args.balance, args.feature_extraction, args.prior):

    # advance the seed each combination (note: the seed is not currently passed to ReviewSimulate)
    args.seed += 1

    print(f"Classifier: '{classifier_name}', feature extraction: '{feature_extraction_name}', query strategy: '{query_name}', balancing strategy: '{balance_name}', prior amounts: '{prior}'")
    start_time = time.time()

    # TODO: Enable lstm models
    if 'lstm' in classifier_name or 'lstm' in feature_extraction_name:
        print('Skipping iteration because lstm models are not supported')
        continue

    classifier = asreview.models.classifiers.get_classifier(classifier_name)
    query = asreview.models.query.get_query_model(query_name)
    balance = asreview.models.balance.get_balance_model(balance_name)
    feature_extraction = asreview.models.feature_extraction.get_feature_model(feature_extraction_name)
    prior_included, prior_excluded = [int(i) for i in prior.split(',')]

    asreview.ReviewSimulate(dataset,
                            model=classifier,
                            query_model=query,
                            balance_model=balance,
                            feature_model=feature_extraction,
                            n_prior_included=prior_included,
                            n_prior_excluded=prior_excluded,
                            state_file=os.path.join(args.output,
                                                    f'{classifier_name}-{query_name}-{balance_name}-{feature_extraction_name}-{prior}.h5')
                            ).review()

    end_time = time.time()
    print('Finished in', (end_time - start_time), 'seconds')