-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Ricardo Rei
authored and
Ricardo Rei
committed
Apr 17, 2020
0 parents
commit 13019fd
Showing
68 changed files
with
4,808 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
*.pkl | ||
*DS_Store | ||
data/ | ||
experiments/ | ||
configs/ | ||
._data | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
# pipenv | ||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | ||
# However, in case of collaboration, if having platform-specific dependencies or dependencies | ||
# having no cross-platform support, pipenv may install dependencies that don't work, or not | ||
# install all needed dependencies. | ||
#Pipfile.lock | ||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# Capitalisation And PuncTuatION (CAPTION) | ||
> PT2020 Transcription project. | ||
In this repository, we explore different strategies for automatic transcription enrichment for ASR data which includes tasks such as automatic capitalization (truecasing) and punctuation recovery. | ||
|
||
## Model architecture: | ||
|
||
 | ||
|
||
### Available Encoders: | ||
- [BERT](https://arxiv.org/abs/1810.04805) | ||
- [RoBERTa](https://arxiv.org/abs/1907.11692) | ||
- [XLM-RoBERTa](https://arxiv.org/pdf/1911.02116.pdf) | ||
|
||
## Requirements: | ||
|
||
This project uses Python >=3.6 | ||
|
||
Create a virtual env with (outside the project folder): | ||
|
||
```bash | ||
virtualenv -p python3.6 caption-env | ||
``` | ||
|
||
Activate venv: | ||
```bash | ||
source caption-env/bin/activate | ||
``` | ||
|
||
Finally, run: | ||
```bash | ||
python setup.py install | ||
``` | ||
|
||
If you wish to make changes to the code, run: | ||
```bash | ||
pip install -r requirements.txt | ||
pip install -e . | ||
``` | ||
|
||
## Getting Started: | ||
|
||
### Train: | ||
```bash | ||
python caption train -f {your_config_file}.yaml | ||
``` | ||
|
||
### Testing: | ||
```bash | ||
python caption test \ | ||
--checkpoint=some/path/to/your/checkpoint.ckpt \ | ||
--test_csv=path/to/your/testset.csv | ||
``` | ||
|
||
### Tensorboard: | ||
|
||
Launch tensorboard with: | ||
```bash | ||
tensorboard --logdir="experiments/lightning_logs/" | ||
``` | ||
|
||
If you are running experiments on a remote server, you can forward your localhost to the server's localhost. | ||
|
||
### How to run the tests: | ||
In order to run the toolkit tests you must run the following command: | ||
|
||
```bash | ||
cd tests | ||
python -m unittest | ||
``` | ||
|
||
### Code Style: | ||
To make sure all the code follows the same style we use [Black](https://github.com/psf/black). |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# -*- coding: utf-8 -*- | ||
import logging | ||
from data_loader import add_data_args | ||
|
||
from models import add_model_args, build_model | ||
from optimizers import add_optimizer_args | ||
from schedulers import add_scheduler_args | ||
from test_tube import HyperOptArgumentParser | ||
from testing import run_testing, setup_testing | ||
from torchnlp.random import set_seed | ||
from training import add_trainer_specific_args, setup_training | ||
from utils import get_main_args_from_yaml, load_yaml_args | ||
|
||
# Configure the root logger at INFO level and expose the module-wide
# logger used by the pipelines below.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("Shell")
|
||
|
||
def run_training_pipeline(parser):
    """Parse the training options, build the model and trainer, and fit.

    The config path and the optimizer/scheduler/model selections are
    registered on ``parser`` first. When none of the three selections is
    given on the command line they are obtained from
    ``get_main_args_from_yaml``; otherwise the command-line values are used
    as given. The parser is then extended by the optimizer, scheduler,
    model and trainer argument helpers before the final hyper-parameters
    are loaded through ``load_yaml_args``.

    :param parser: Argument parser to extend and parse with.
    """
    parser.add_argument(
        "-f", "--config", type=str, default=False, help="Path to a YAML config file."
    )
    # NOTE(review): the "--model" help text contains a duplicated word
    # ("we we"); it is reproduced unchanged here.
    selection_flags = (
        ("--optimizer", "Optimizer to be used during training."),
        ("--scheduler", "LR scheduler to be used during training."),
        ("--model", "The estimator architecture we we wish to use."),
    )
    for flag, description in selection_flags:
        parser.add_argument(flag, type=str, default=False, help=description)

    # Options that are not registered yet are tolerated at this stage.
    cli_args, _unparsed = parser.parse_known_args()

    if cli_args.optimizer or cli_args.scheduler or cli_args.model:
        # At least one selection came from the command line: use all three
        # command-line values as they are (unset ones keep their default).
        optimizer_choice = cli_args.optimizer
        scheduler_choice = cli_args.scheduler
        model_choice = cli_args.model
    else:
        optimizer_choice, scheduler_choice, model_choice = get_main_args_from_yaml(
            cli_args
        )

    # Each helper returns the parser extended with its own arguments.
    parser = add_optimizer_args(parser, optimizer_choice)
    parser = add_scheduler_args(parser, scheduler_choice)
    parser = add_model_args(parser, model_choice)
    parser = add_trainer_specific_args(parser)
    hparams = load_yaml_args(parser=parser, log=log)

    set_seed(hparams.seed)
    model = build_model(hparams)
    trainer = setup_training(hparams)

    weights_path = hparams.load_weights
    if weights_path:
        model.load_weights(weights_path)

    log.info(f"{model.__class__.__name__} train starting:")
    trainer.fit(model)
|
||
|
||
def run_testing_pipeline(parser):
    """Parse the testing options and hand them to ``run_testing``.

    :param parser: Argument parser; it is extended through ``add_data_args``
        and with a ``--checkpoint`` option before the arguments are parsed.
    """
    testing_parser = add_data_args(parser)
    testing_parser.add_argument(
        "--checkpoint", help="Checkpoint file path.", default=None
    )
    run_testing(testing_parser.parse_args())
|
||
|
||
if __name__ == "__main__":
    # Only the positional pipeline choice is registered here; the selected
    # pipeline function adds its own options to the same parser.
    parser = HyperOptArgumentParser(
        description="CAPTION project", strategy="random_search", add_help=True
    )
    parser.add_argument(
        "pipeline", help="train a model or test.", choices=["train", "test"]
    )
    # parse_known_args leaves options that are not registered yet unparsed
    # instead of failing on them.
    args, _ = parser.parse_known_args()
    run_pipeline = (
        run_testing_pipeline if args.pipeline == "test" else run_training_pipeline
    )
    run_pipeline(parser)
Oops, something went wrong.