From da6f442a500c6fc5ed2bb4917d2ba0c93d23f323 Mon Sep 17 00:00:00 2001
From: Alex Waese-Perlman
Date: Sun, 9 May 2021 13:21:05 -0400
Subject: [PATCH] initial commit

---
 .gitignore     |   2 +
 README.md      |  43 ++++++++++++++
 test-models.py | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 test-models.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d5b5a06
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+embeddings/*
+data/*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8f3aeb9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Exhaustive ASReview model search
+This repository lets you test every combination of a selected set of ASReview classifiers, query strategies, balancing strategies, and feature-extraction models on a single labelled dataset.
+
+## Usage
+Start by running `pip install asreview[all]` to install the required packages, then run `python test-models.py -h` to see how to start a search. The output should look something like this:
+```
+usage: test-models.py [-h] [-o OUTPUT] [-c CLASSIFIERS [CLASSIFIERS ...]] [-q QUERY [QUERY ...]] [-b BALANCE [BALANCE ...]]
+                      [-f FEATURE_EXTRACTION [FEATURE_EXTRACTION ...]] [-p PRIOR [PRIOR ...]] [-n N_INSTANCES] [-P PRESET]
+                      [-s SEED]
+                      filename
+
+Test many different asreview models on one dataset to see which perform best
+
+positional arguments:
+  filename              Path to a labelled csv of abstracts. It should have four columns labelled: "Title", "Abstract",
+                        "Authors", "Included".
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -o OUTPUT, --output OUTPUT
+                        Path to a directory to output the results into. It is created if necessary.
+  -c CLASSIFIERS [CLASSIFIERS ...], --classifiers CLASSIFIERS [CLASSIFIERS ...]
+                        List of classifiers that will be tested. The accepted options are: logistic, lstm-base, lstm-pool,
+                        nb, nn-2-layer, rf, svm
+  -q QUERY [QUERY ...], --query QUERY [QUERY ...]
+                        List of query strategies that will be tested. The accepted options are: cluster, max, random,
+                        uncertainty
+  -b BALANCE [BALANCE ...], --balance BALANCE [BALANCE ...]
+                        List of balancing strategies that will be tested. The accepted options are: double, simple, triple,
+                        undersample
+  -f FEATURE_EXTRACTION [FEATURE_EXTRACTION ...], --feature_extraction FEATURE_EXTRACTION [FEATURE_EXTRACTION ...]
+                        List of feature extraction models that will be tested. The accepted options are: doc2vec, embedding-
+                        idf, embedding-lstm, sbert, tfidf
+  -p PRIOR [PRIOR ...], --prior PRIOR [PRIOR ...]
+                        List of the number of prelabelled papers to include, formatted as prior_included,prior_excluded.
+                        For example: --prior 1,1 5,5 5,10
+  -n N_INSTANCES, --n_instances N_INSTANCES
+                        The number of times each combination is run
+  -P PRESET, --preset PRESET
+                        The name of the preset test to use. Valid options are: default
+  -s SEED, --seed SEED  The random seed for reproducibility.
+```
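+
+## Example
+As a concrete (hypothetical) example, the following call compares only the naive Bayes and SVM classifiers with TF-IDF features and the max query strategy on a labelled file at `data/reviews.csv`, writing one state file per combination into `results/`:
+```
+python test-models.py data/reviews.csv -c nb svm -f tfidf -q max -b simple -o results
+```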
diff --git a/test-models.py b/test-models.py
new file mode 100644
index 0000000..23069b1
--- /dev/null
+++ b/test-models.py
@@ -0,0 +1,150 @@
+import argparse
+import itertools
+import os
+import sys
+import time
+
+import asreview
+import asreview.models.balance
+import asreview.models.classifiers
+import asreview.models.feature_extraction
+import asreview.models.query
+import asreview.review
+
+presets = {
+    'default': {
+        'classifiers': ['logistic', 'nb', 'nn-2-layer', 'rf', 'svm'],
+        'query': ['max', 'cluster'],
+        'balance': ['simple'],
+        'feature_extraction': ['tfidf', 'sbert', 'doc2vec'],
+        'prior': ['1,1', '5,5', '10,10'],
+        'n_instances': 1
+    }
+}
+
+parser = argparse.ArgumentParser(description='Test many different asreview models on one dataset to see which perform best')
+parser.add_argument('filename', type=str, help='Path to a labelled csv of abstracts. It should have four columns labelled: "Title", "Abstract", "Authors", "Included".')
+
+parser.add_argument('-o', '--output',
+                    help='Path to a directory to output the results into. It is created if necessary.',
+                    action='store',
+                    default='.',
+                    type=str)
+
+# The list options default to None and are filled in after parsing. With
+# action='extend', a non-None list default would be extended by any
+# user-supplied values instead of being replaced by them.
+parser.add_argument('-c', '--classifiers',
+                    help='List of classifiers that will be tested. The accepted options are: ' + ', '.join(asreview.models.classifiers.list_classifiers()),
+                    action='extend',
+                    default=None,
+                    nargs='+',
+                    type=str)
+
+parser.add_argument('-q', '--query',
+                    help='List of query strategies that will be tested. The accepted options are: ' + ', '.join(asreview.models.query.list_query_strategies()),
+                    action='extend',
+                    default=None,
+                    nargs='+',
+                    type=str)
+
+parser.add_argument('-b', '--balance',
+                    help='List of balancing strategies that will be tested. The accepted options are: ' + ', '.join(asreview.models.balance.list_balance_strategies()),
+                    action='extend',
+                    default=None,
+                    nargs='+',
+                    type=str)
+
+parser.add_argument('-f', '--feature_extraction',
+                    help='List of feature extraction models that will be tested. The accepted options are: ' + ', '.join(asreview.models.feature_extraction.list_feature_extraction()),
+                    action='extend',
+                    default=None,
+                    nargs='+',
+                    type=str)
+
+parser.add_argument('-p', '--prior',
+                    help='List of the number of prelabelled papers to include, formatted as prior_included,prior_excluded. For example: --prior 1,1 5,5 5,10',
+                    action='extend',
+                    default=None,
+                    nargs='+',
+                    type=str)
+
+parser.add_argument('-n', '--n_instances',
+                    help='The number of times each combination is run',
+                    action='store',
+                    default=1,
+                    type=int)
+
+parser.add_argument('-P', '--preset',
+                    help='The name of the preset test to use. Valid options are: ' + ', '.join(presets.keys()),
+                    action='store',
+                    default=None,
+                    type=str)
+
+parser.add_argument('-s', '--seed',
+                    help='The random seed for reproducibility.',
+                    action='store',
+                    default=0,
+                    type=int)
+
+args = parser.parse_args()
+
+if args.preset is not None:
+    if args.preset not in presets:
+        parser.error('Unknown preset: ' + args.preset)
+    for k, v in presets[args.preset].items():
+        setattr(args, k, v)
+
+# Any list argument that is still unset falls back to every available option.
+if args.classifiers is None:
+    args.classifiers = asreview.models.classifiers.list_classifiers()
+if args.query is None:
+    args.query = asreview.models.query.list_query_strategies()
+if args.balance is None:
+    args.balance = asreview.models.balance.list_balance_strategies()
+if args.feature_extraction is None:
+    args.feature_extraction = asreview.models.feature_extraction.list_feature_extraction()
+if args.prior is None:
+    args.prior = ['1,1', '5,5', '5,10']
+
+# TODO: Download embedding files and enable lstm models
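+# A possible sketch of that step (commented out and untested; the URL is a
+# placeholder, not a real download location):
+#
+#     from urllib.request import urlretrieve
+#     import zipfile
+#
+#     EMBEDDING_URL = 'https://example.com/embeddings.zip'  # hypothetical
+#     os.makedirs('embeddings', exist_ok=True)
+#     archive, _ = urlretrieve(EMBEDDING_URL, os.path.join('embeddings', 'embeddings.zip'))
+#     with zipfile.ZipFile(archive) as zf:
+#         zf.extractall('embeddings')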
+
+if not os.path.exists(args.filename):
+    raise FileNotFoundError(f"Data file '{args.filename}' not found.")
+
+n_runs = (len(args.classifiers) * len(args.query) * len(args.balance)
+          * len(args.feature_extraction) * len(args.prior) * args.n_instances)
+print()
+print(f'Model testing will take {n_runs} iterations.\n'
+      'Each iteration can take up to an hour, and the output files can be up to a few GB, depending on the type of model and dataset used.\n'
+      'Are you sure you would like to continue? [Y/n]', end=' ')
+
+resp = input()
+
+if resp.lower().startswith('n'):
+    print('Exiting.')
+    sys.exit(0)
+
+os.makedirs(args.output, exist_ok=True)
+
+print('Running models')
+
+dataset = asreview.ASReviewData.from_file(args.filename)
+
+# Run a simulation for every combination of the supplied options, repeating
+# each combination n_instances times.
+for (classifier_name, query_name, balance_name, feature_extraction_name,
+     prior, run) in itertools.product(args.classifiers, args.query, args.balance,
+                                      args.feature_extraction, args.prior,
+                                      range(args.n_instances)):
+
+    # TODO: pass the seed through to the models; it is currently only incremented
+    args.seed += 1
+
+    print(f"Classifier: '{classifier_name}', feature extraction: '{feature_extraction_name}', "
+          f"query strategy: '{query_name}', balancing strategy: '{balance_name}', prior amounts: '{prior}'")
+    start_time = time.time()
+
+    # TODO: Enable lstm models
+    if 'lstm' in classifier_name or 'lstm' in feature_extraction_name:
+        print('Skipping iteration because lstm models are not supported')
+        continue
+
+    classifier = asreview.models.classifiers.get_classifier(classifier_name)
+    query = asreview.models.query.get_query_model(query_name)
+    balance = asreview.models.balance.get_balance_model(balance_name)
+    feature_extraction = asreview.models.feature_extraction.get_feature_model(feature_extraction_name)
+    prior_included, prior_excluded = [int(i) for i in prior.split(',')]
+
+    # The run index is included in the file name so that repeated runs of the
+    # same combination do not overwrite each other.
+    asreview.ReviewSimulate(dataset,
+                            model=classifier,
+                            query_model=query,
+                            balance_model=balance,
+                            feature_model=feature_extraction,
+                            n_prior_included=prior_included,
+                            n_prior_excluded=prior_excluded,
+                            state_file=os.path.join(
+                                args.output,
+                                f'{classifier_name}-{query_name}-{balance_name}-{feature_extraction_name}-{prior}-{run}.h5')
+                            ).review()
+
+    end_time = time.time()
+    print('Finished in', end_time - start_time, 'seconds')
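+
+# The .h5 state files written above can be compared afterwards, for example
+# with the separate asreview-visualization extension (a sketch, assuming the
+# extension is installed and the results were written to results/ via -o):
+#
+#     pip install asreview-visualization
+#     asreview plot results/*.h5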