diff --git a/armory/baseline_models/pytorch/deep_speech.py b/armory/baseline_models/pytorch/deep_speech.py deleted file mode 100644 index b3efd23d1..000000000 --- a/armory/baseline_models/pytorch/deep_speech.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -Automatic speech recognition model - -Model contributed by: MITRE Corporation -""" - -from typing import Optional - -from art.estimators.speech_recognition import PyTorchDeepSpeech - -from armory.utils.external_repo import ExternalRepoImport - -# Test for external repo at import time to fail fast -with ExternalRepoImport( - repo="SeanNaren/deepspeech.pytorch@V3.0", - experiment="librispeech_asr_snr_undefended.json", -): - from deepspeech_pytorch.model import DeepSpeech # noqa: F401 - - -def get_art_model( - model_kwargs: dict, wrapper_kwargs: dict, weights_path: Optional[str] = None -) -> PyTorchDeepSpeech: - return PyTorchDeepSpeech(**wrapper_kwargs) diff --git a/armory/baseline_models/pytorch/sincnet.py b/armory/baseline_models/pytorch/sincnet.py deleted file mode 100644 index 37401045d..000000000 --- a/armory/baseline_models/pytorch/sincnet.py +++ /dev/null @@ -1,289 +0,0 @@ -""" -CNN model for raw audio classification - -Model contributed by: MITRE Corporation -Adapted from: https://github.com/mravanelli/SincNet -""" -from typing import Optional - -from art.estimators.classification import PyTorchClassifier -import numpy as np -import torch -from torch import nn - -from armory.utils.external_repo import ExternalRepoImport - -with ExternalRepoImport( - repo="hkakitani/SincNet", - experiment="librispeech_baseline_sincnet.json", -): - from SincNet import dnn_models - -# NOTE: Underlying dataset sample rate is 16 kHz. SincNet uses this SAMPLE_RATE to -# determine internal filter high cutoff frequency. 
-SAMPLE_RATE = 8000 -WINDOW_STEP_SIZE = 375 -WINDOW_LENGTH = int(SAMPLE_RATE * WINDOW_STEP_SIZE / 1000) - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def numpy_random_preprocessing_fn(batch: np.ndarray): - """ - Standardize, then normalize sound clips - - Then generate a random cut of the input - """ - processed_batch = [] - for clip in batch: - # convert and normalize - signal = clip.astype(np.float32) - # Signal normalization - signal = signal / np.max(np.abs(signal)) - - # make a pseudorandom cut of size equal to WINDOW_LENGTH - # (from SincNet's create_batches_rnd) - signal_length = len(signal) - np.random.seed(signal_length) - signal_start = int( - np.random.randint(signal_length / WINDOW_LENGTH - 1) - * WINDOW_LENGTH - % signal_length - ) - signal_stop = signal_start + WINDOW_LENGTH - signal = signal[signal_start:signal_stop] - processed_batch.append(signal) - - return np.array(processed_batch) - - -def numpy_all_preprocessing_fn(batch: np.ndarray): - """ - Input is comprised of one or more clips, where each clip i - is given as an ndarray with shape (n_i,). - Preprocessing normalizes each clip and breaks each clip into an integer number - of non-overlapping segments of length WINDOW_LENGTH. 
- Output is a list of clips, each of shape (int(n_i/WINDOW_LENGTH), WINDOW_LENGTH) - """ - if len(batch) != 1: - raise NotImplementedError( - "Requires ART variable length input capability for batch size != 1" - ) - processed_batch = [] - for clip in batch: - # convert and normalize - signal = clip.astype(np.float64) - signal = signal / np.max(np.abs(signal)) - - # break into a number of chunks of equal length - num_chunks = int(len(signal) / WINDOW_LENGTH) - signal = signal[: num_chunks * WINDOW_LENGTH] - signal = np.reshape(signal, (num_chunks, WINDOW_LENGTH), order="C") - processed_batch.append(signal) - # remove outer batch (of size 1) - processed_batch = processed_batch[0] - return np.array(processed_batch) - - -def torch_random_preprocessing_fn(x): - """ - Standardize, then normalize sound clips - """ - if x.shape[0] != 1: - raise ValueError(f"Shape of batch x {x.shape[0]} != 1") - if x.dtype != torch.float32: - raise ValueError(f"dtype of batch x {x.dtype} != torch.float32") - if x.max() > 1.0: - raise ValueError(f"batch x max {x.max()} > 1.0") - if x.min() < -1.0: - raise ValueError(f"batch x min {x.min()} < -1.0") - x = x.squeeze(0) - - # Signal normalization - x = x / x.abs().max() - - # get pseudorandom chunk of fixed length (from SincNet's create_batches_rnd) - signal_length = len(x) - np.random.seed(signal_length) - start = int( - np.random.randint(signal_length / WINDOW_LENGTH - 1) - * WINDOW_LENGTH - % signal_length - ) - - x = x[start : start + WINDOW_LENGTH] - - x = x.unsqueeze(0) - return x - - -def torch_all_preprocessing_fn(x: torch.Tensor): - """ - Input is comprised of one or more clips, where each clip i - is given as an ndarray with shape (n_i,). - Preprocessing normalizes each clip and breaks each clip into an integer number - of non-overlapping segments of length WINDOW_LENGTH. 
- Output is a list of clips, each of shape (int(n_i/WINDOW_LENGTH), WINDOW_LENGTH) - """ - if x.shape[0] != 1: - raise NotImplementedError( - "Requires ART variable length input capability for batch size != 1" - ) - if x.max() > 1.0: - raise ValueError(f"batch x max {x.max()} > 1.0") - if x.min() < -1.0: - raise ValueError(f"batch x min {x.min()} < -1.0") - if x.dtype != torch.float32: - raise ValueError(f"dtype of batch x {x.dtype} != torch.float32") - x = x.squeeze(0) - - # Signal normalization - x = x / x.abs().max() - - # break into a number of chunks of equal length - num_chunks = int(len(x) / WINDOW_LENGTH) - x = x[: num_chunks * WINDOW_LENGTH] - x = x.reshape((num_chunks, WINDOW_LENGTH)) - - return x - - -def sincnet(weights_path: Optional[str] = None) -> dnn_models.SincWrapper: - """ - Set configuration options and instantiates SincWrapper object - """ - pretrained = weights_path is not None - if pretrained: - model_params = torch.load(weights_path, map_location=DEVICE) - else: - model_params = {} - CNN_params = model_params.get("CNN_model_par") - DNN1_params = model_params.get("DNN1_model_par") - DNN2_params = model_params.get("DNN2_model_par") - - # from SincNet/cfg/SincNet_dev_LibriSpeech.cfg - cnn_N_filt = [80, 60, 60] - cnn_len_filt = [251, 5, 5] - cnn_max_pool_len = [3, 3, 3] - cnn_use_laynorm_inp = True - cnn_use_batchnorm_inp = False - cnn_use_laynorm = [True, True, True] - cnn_use_batchnorm = [False, False, False] - cnn_act = ["relu", "relu", "relu"] - cnn_drop = [0.0, 0.0, 0.0] - - fc_lay = [2048, 2048, 2048] - fc_drop = [0.0, 0.0, 0.0] - fc_use_laynorm_inp = True - fc_use_batchnorm_inp = False - fc_use_batchnorm = [True, True, True] - fc_use_laynorm = [False, False, False] - fc_act = ["leaky_relu", "linear", "leaky_relu"] - - class_lay = [40] - class_drop = [0.0, 0.0] - class_use_laynorm_inp = True - class_use_batchnorm_inp = False - class_use_batchnorm = [False] - class_use_laynorm = [False] - class_act = ["softmax"] - - CNN_options = { - 
"input_dim": WINDOW_LENGTH, - "fs": SAMPLE_RATE, - "cnn_N_filt": cnn_N_filt, - "cnn_len_filt": cnn_len_filt, - "cnn_max_pool_len": cnn_max_pool_len, - "cnn_use_laynorm_inp": cnn_use_laynorm_inp, - "cnn_use_batchnorm_inp": cnn_use_batchnorm_inp, - "cnn_use_laynorm": cnn_use_laynorm, - "cnn_use_batchnorm": cnn_use_batchnorm, - "cnn_act": cnn_act, - "cnn_drop": cnn_drop, - "pretrained": pretrained, - "model_params": CNN_params, - } - - DNN1_options = { - "fc_lay": fc_lay, - "fc_drop": fc_drop, - "fc_use_batchnorm": fc_use_batchnorm, - "fc_use_laynorm": fc_use_laynorm, - "fc_use_laynorm_inp": fc_use_laynorm_inp, - "fc_use_batchnorm_inp": fc_use_batchnorm_inp, - "fc_act": fc_act, - "pretrained": pretrained, - "model_params": DNN1_params, - } - - DNN2_options = { - "input_dim": fc_lay[-1], - "fc_lay": class_lay, - "fc_drop": class_drop, - "fc_use_batchnorm": class_use_batchnorm, - "fc_use_laynorm": class_use_laynorm, - "fc_use_laynorm_inp": class_use_laynorm_inp, - "fc_use_batchnorm_inp": class_use_batchnorm_inp, - "fc_act": class_act, - } - - sincNet = dnn_models.SincWrapper(DNN2_options, DNN1_options, CNN_options) - - if pretrained: - sincNet.eval() - sincNet.load_state_dict(DNN2_params) - - else: - sincNet.train() - - return sincNet - - -class SincNetWrapper(nn.Module): - MODES = { - "random": torch_random_preprocessing_fn, - "all": torch_all_preprocessing_fn, - } - - def __init__(self, model_kwargs: dict, weights_path: Optional[str]) -> None: - super().__init__() - predict_mode = model_kwargs.pop("predict_mode", "all") - if predict_mode not in self.MODES: - raise ValueError(f"predict_mode {predict_mode} not in {tuple(self.MODES)}") - self.predict_mode = predict_mode - - self.model = sincnet(weights_path=weights_path, **model_kwargs) - self.model.to(DEVICE) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.training: - # preprocessing should be done before model for arbitrary length input - return self.model(x) - - x = self.MODES[self.predict_mode](x) - 
output = self.model(x) - if self.predict_mode == "all": - output = torch.mean(output, dim=0, keepdim=True) - return output - - -preprocessing_fn = numpy_random_preprocessing_fn - - -def get_art_model( - model_kwargs: dict, wrapper_kwargs: dict, weights_path: Optional[str] = None -) -> PyTorchClassifier: - model = SincNetWrapper(model_kwargs, weights_path) - model.to(DEVICE) - - wrapped_model = PyTorchClassifier( - model, - loss=torch.nn.NLLLoss(), - optimizer=torch.optim.RMSprop( - model.parameters(), lr=0.001, alpha=0.95, eps=1e-8 - ), - input_shape=(None,), - nb_classes=40, - **wrapper_kwargs, - ) - return wrapped_model diff --git a/armory/datasets/README.md b/armory/datasets/README.md index 76c2d8ac2..27e4f9933 100644 --- a/armory/datasets/README.md +++ b/armory/datasets/README.md @@ -84,6 +84,19 @@ info, ds = load.load("digit") info, ds = load.from_directory("/armory/datasets/new_builds/digit/1.0.8") ``` +### Apache Beam Datasets + +Currently, `librispeech` and `librispeech_dev_clean` use apache beam to build. +Apache beam is not installed by default in the container due to older dependencies. +If building in the container, do: +``` +pip install apache-beam +``` + +When building, armory does not provide beam options by default. +This makes building VERY slow unless overrides are provided. +It is recommended that these are built directly using tfds on the command line. + ## Packaging and Uploading for Cache After a dataset has been successfully built and loaded (locally), it can be packaged and uploaded to the cache. @@ -91,43 +104,44 @@ After a dataset has been successfully built and loaded (locally), it can be pack First, it is recommended that you test the packaging and untarring process without upload/download. 
In python: -``` +```python from armory.datasets import package -package.package("my_dataset") # creates a tar.gz file -package.update("my_dataset") # adds the tar hash info to "cached_datasets.json" -package.verify("my_dataset") # uses the "cached_datasets.json" information to verify hash information on tar file -package.extract("my_dataset", overwrite=False) # This should raise an error, unless you first remove the built dataset; it will ask you to overwrite -package.extract("my_dataset", overwrite=True) # extracts the tar file into the data directory, overwriting the old one (if overwrite is false, this should raise an error) +my_dataset = "my_dataset" +package.package(my_dataset) # creates a tar.gz file +package.update(my_dataset) # adds the tar hash info to "cached_datasets.json" +package.verify(my_dataset) # uses the "cached_datasets.json" information to verify hash information on tar file +package.extract(my_dataset, overwrite=False) # This should raise an error, unless you first remove the built dataset; it will ask you to overwrite +package.extract(my_dataset, overwrite=True) # extracts the tar file into the data directory, overwriting the old one (if overwrite is false, this should raise an error) ``` If you can successfully load the dataset after extracting it here, this part is good. 
Now, to upload to s3 (you will need `ARMORY_PRIVATE_S3_ID` and `ARMORY_PRIVATE_S3_KEY`): -``` +```python from armory.datasets import upload -upload.upload("my_dataset") # this will fail, as you need to explicitly force it to be public -upload.upload("my_dataset", public=True) +upload.upload(my_dataset) # this will fail, as you need to explicitly force it to be public +upload.upload(my_dataset, public=True) ``` Or, alternatively to packaging and uploading, you can use this convenience function: -``` -package.add_to_cache("my_dataset", public=True) +```python +package.add_to_cache(my_dataset, public=True) ``` To download, which will download it directly to the tar cache directory, do: ``` from armory.datasets import download -download.download("my_dataset", overwrite=True, verify=True) +download.download(my_dataset, overwrite=True, verify=True) ``` You can also download and extract with: ``` from armory.datasets import load -load.ensure_download_extract("my_dataset", verify=True) +load.ensure_download_extract(my_dataset, verify=True) ``` or just try to load it directly ``` -load.load("my_dataset") +load.load(my_dataset) ``` # Running / Testing with current armory scenario files diff --git a/armory/datasets/cached_datasets.json b/armory/datasets/cached_datasets.json index a3b6a2fd8..add4b96e4 100644 --- a/armory/datasets/cached_datasets.json +++ b/armory/datasets/cached_datasets.json @@ -13,6 +13,13 @@ "url": null, "version": "1.0.8" }, + "librispeech_dev_test": { + "sha256": "5c5c6cb53e458e2415bc4f242122155d51f32d7e78770176afe01acb584c4caa", + "size": 2332265306, + "subdir": "librispeech_dev_test/2.1.0", + "url": null, + "version": "2.1.0" + }, "mnist": { "sha256": "fdc3408e29580367145e95ac7cb1d51e807105b174314cd52c16d27a13b98979", "size": 16920751, diff --git a/armory/datasets/preprocessing.py b/armory/datasets/preprocessing.py index 91e7c15b1..1ca0e4190 100644 --- a/armory/datasets/preprocessing.py +++ b/armory/datasets/preprocessing.py @@ -64,6 +64,24 @@ def 
xview(element): ) +@register +def librispeech(element, audio_kwargs=None): + # TODO: determine how to fix np.array([], dtype=object) output for text + # https://github.com/tensorflow/tensorflow/issues/34871 + # Our traditional behavior to decode to str once in numpy + # This can be done via: y.astype("U") + # Currently, this is handled by scenarios or metrics after dataset output + # NOTE: 16000 sampling rate + if audio_kwargs is None: + audio_kwargs = {} + text = element["text"] + speech = audio_to_canon(element["speech"], **audio_kwargs) + return (speech, text) + + +librispeech_dev_test = register(librispeech, "librispeech_dev_test") + + def image_to_canon(image, resize=None, target_dtype=tf.float32, input_type="uint8"): """ TFDS Image feature uses (height, width, channels) @@ -98,14 +116,6 @@ def audio_to_canon(audio, resample=None, target_dtype=tf.float32, input_type="in return audio -# config = { -# "preprocessor": "mnist(max_frames=1)" -# "preprocessor_kwargs": { -# "max_frames": null, -# } -# } - - def video_to_canon( video, resize=None, diff --git a/armory/datasets/standard/librispeech_dev_test/__init__.py b/armory/datasets/standard/librispeech_dev_test/__init__.py new file mode 100644 index 000000000..d84f1d722 --- /dev/null +++ b/armory/datasets/standard/librispeech_dev_test/__init__.py @@ -0,0 +1,3 @@ +"""librispeech_dev_test dataset.""" + +from .librispeech_dev_test import LibrispeechDevTest diff --git a/armory/datasets/standard/librispeech_dev_test/checksums.tsv b/armory/datasets/standard/librispeech_dev_test/checksums.tsv new file mode 100644 index 000000000..edb48d2cf --- /dev/null +++ b/armory/datasets/standard/librispeech_dev_test/checksums.tsv @@ -0,0 +1 @@ +# NOTE: This file is empty due to subclassing the existing tfds librispeech builder: https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/audio/librispeech.py diff --git a/armory/datasets/standard/librispeech_dev_test/librispeech_dev_test.py 
b/armory/datasets/standard/librispeech_dev_test/librispeech_dev_test.py new file mode 100644 index 000000000..5fcb31e72 --- /dev/null +++ b/armory/datasets/standard/librispeech_dev_test/librispeech_dev_test.py @@ -0,0 +1,41 @@ +""" +Subset of librispeech containing just 'dev' and 'test' splits. + +checksums.tsv is empty as it uses the underlying librispeech class. + +NOTE: In order to build, this requires apache beam installed. + In the container, do: `pip install apache-beam` + This is not installed by default due to older dependencies + +NOTE: when building, armory does not provide beam options by default + This makes building VERY slow unless overrides are provided + It is recommended that this is built directly using tfds on the command line + +Using DirectRunner with apache beam, can build with this: + tfds build /workspace/armory/datasets/standard/librispeech_dev_test --data_dir /armory/datasets/new_builds --force_checksums_validation --beam_pipeline_options="runner=DirectRunner,direct_num_workers=16,direct_running_mode=multi_processing" + See: https://beam.apache.org/releases/pydoc/2.43.0/_modules/apache_beam/options/pipeline_options.html#DirectOptions +""" + +import tensorflow_datasets as tfds +from tensorflow_datasets.audio import librispeech + +_SUBSET = ( + "dev_clean", + "dev_other", + "test_clean", + "test_other", +) +_DL_URLS = {k: v for k, v in librispeech._DL_URLS.items() if k in _SUBSET} + + +class LibrispeechDevTest(librispeech.Librispeech): + """DatasetBuilder for subset of Librispeech""" + + def _split_generators(self, dl_manager): + extracted_dirs = dl_manager.download_and_extract(_DL_URLS) + self._populate_metadata(extracted_dirs) + splits = [ + tfds.core.SplitGenerator(name=k, gen_kwargs={"directory": v}) + for k, v in extracted_dirs.items() + ] + return splits diff --git a/armory/scenarios/audio_asr.py b/armory/scenarios/audio_asr.py index 93c73aca8..22bb29b2c 100644 --- a/armory/scenarios/audio_asr.py +++ b/armory/scenarios/audio_asr.py @@ 
-110,5 +110,6 @@ def load_test_dataset(self, test_split_default="test_clean"): def _load_sample_exporter(self): return AudioExporter( self.export_dir, - self.test_dataset.context.sample_rate, + self.test_dataset.info.metadata["sample_rate"], # TODO: smarter way? + # self.test_dataset.info['speech'].sample_rate, # TODO: get in a smarter way ) diff --git a/armory/scenarios/audio_classification.py b/armory/scenarios/audio_classification.py index ef0aa1e90..7bcdf7545 100644 --- a/armory/scenarios/audio_classification.py +++ b/armory/scenarios/audio_classification.py @@ -16,5 +16,6 @@ def load_test_dataset(self): def _load_sample_exporter(self): return AudioExporter( self.export_dir, - self.test_dataset.context.sample_rate, + self.test_dataset.info.metadata["sample_rate"], # TODO: smarter way? + # self.test_dataset.info['speech'].sample_rate, ) diff --git a/docs/baseline_models.md b/docs/baseline_models.md index 4fde37b87..ede6900ca 100644 --- a/docs/baseline_models.md +++ b/docs/baseline_models.md @@ -37,8 +37,6 @@ The model files can be found in [armory/baseline_models/pytorch](../armory/basel | Model | S3 weight_files | |:----------: |:---------------------------------------------:| | Cifar10 CNN | | -| DeepSpeech 2 | | -| Sincnet CNN | `sincnet_librispeech_v1.pth` | | MARS | `mars_ucf101_v1.pth` , `mars_kinetics_v1.pth` | | ResNet50 CNN | `resnet50_imagenet_v1.pth` | | MNIST CNN | `undefended_mnist_5epochs.pth` | @@ -59,4 +57,4 @@ The weights for this model are downloaded from the link listed below. ### Preprocessing Functions Preprocessing functions have been moved inside each model's forward pass. This is to allow each -model to receive as input the canonicalized form of a dataset. \ No newline at end of file +model to receive as input the canonicalized form of a dataset. 
diff --git a/docs/datasets.md b/docs/datasets.md index 681147a2a..6d60a1850 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -56,14 +56,10 @@ The carla_over_obj_det_train dataset has the same properties as the above mentio | Dataset | Description | x_shape | x_dtype | y_shape | y_dtype | sampling_rate | splits | |:----------: |:-----------: |:-------: |:--------: |:--------: |:-------: |:-------: |:------: | | [digit](https://github.com/Jakobovski/free-spoken-digit-dataset) | Audio dataset of spoken digits | (N, variable_length) | int64 | (N,) | int64 | 8 kHz | train, test | -| [librispeech](http://www.openslr.org/12/) | Librispeech dataset for automatic speech recognition | (N, variable_length) | float32 | (N,) | bytes | 16 kHz | dev_clean, dev_other, test_clean, train_clean100 | -| [librispeech-full](http://www.openslr.org/12/) | Full Librispeech dataset for automatic speech recognition | (N, variable_length) | float32 | (N,) | bytes | 16 kHz | dev_clean, dev_other, test_clean, train_clean100, train_clean360, train_other500 | -| [librispeech_dev_clean](http://www.openslr.org/12/) | Librispeech dev dataset for speaker identification | (N, variable_length) | float32 | (N,) | int64 | 16 kHz | train, validation, test | -| [librispeech_dev_clean_asr](http://www.openslr.org/12) | Librispeech dev dataset for automatic speech recognition | (N, variable_length) | float32 | (N,) | bytes | 16 kHz | train, validation, test | +| [librispeech](http://www.openslr.org/12/) | Librispeech dataset for automatic speech recognition (NOTE: not currently cached. Use TFDS builder.) 
| (N, variable_length) | float32 | (N,) | bytes | 16 kHz | dev_clean, dev_other, test_clean, test_other, train_clean100, train_clean360, train_other500 | +| [librispeech_dev_test](http://www.openslr.org/12/) | Librispeech with only dev and test splits | (N, variable_length) | float32 | (N,) | int64 | 16 kHz | dev_clean, dev_other, test_clean, test_other | | [speech_commands](https://www.tensorflow.org/datasets/catalog/speech_commands) | Speech commands dataset for audio poisoning | (N, variable_length) | float32 | (N,) | int64 | 16 kHz | train, validation, test | -NOTE: because the Librispeech dataset is over 300 GB with all splits, the ```librispeech_full``` dataset has -all splits, whereas the ```librispeech``` dataset does not have the train_clean360 or train_other500 splits.
### Video Datasets @@ -101,9 +97,6 @@ Tensorflow Datasets [library](https://www.tensorflow.org/datasets/catalog/overvi | resisc_45 | train | First 5/7 of dataset | See armory/data/resisc45/resisc45_dataset_partition.py | | | validation | Next 1/7 of dataset | | | | test | Final 1/7 of dataset | | -| librispeech_dev_clean | train | 1371 recordings from dev_clean dataset | Assign discrete clips so at least 50% of audio time | -| | validation | 692 recordings from dev_clean dataset | is in train, at least 25% is in validation, | -| | test | 640 recordings from dev_clean dataset | and the remainder are in test |
diff --git a/scenario_configs/asr_librispeech_entailment.json b/scenario_configs/asr_librispeech_entailment.json index 752937374..b3038f281 120000 --- a/scenario_configs/asr_librispeech_entailment.json +++ b/scenario_configs/asr_librispeech_entailment.json @@ -1 +1 @@ -eval5/asr_librispeech/entailment.json \ No newline at end of file +eval6/asr_librispeech/hubert_entailment.json \ No newline at end of file diff --git a/scenario_configs/asr_librispeech_targeted.json b/scenario_configs/asr_librispeech_targeted.json index 04b2e2ac6..37d7e1bd6 120000 --- a/scenario_configs/asr_librispeech_targeted.json +++ b/scenario_configs/asr_librispeech_targeted.json @@ -1 +1 @@ -eval5/asr_librispeech/untargeted_snr_pgd.json \ No newline at end of file +eval6/asr_librispeech/hubert_targeted_snr_pgd.json \ No newline at end of file diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_defended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_defended.json deleted file mode 100755 index cdecb16ce..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_defended.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "decrease_factor_alpha": 0.5, - "decrease_factor_eps": 0.5, - "eps": 0.0075, - "global_max_length": 562480, - "increase_factor_alpha": 2.0, - "initial_rescale": 1.0, - "learning_rate_1": 0.0001, - "learning_rate_2": 1e-07, - "max_iter_1": 400, - "max_iter_2": 100, - "num_iter_decrease_alpha": 50 - }, - "module": "art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch", - "name": "ImperceptibleASRPyTorch", - "targeted": true, - "targeted_labels": { - "scheme": "matched length", - "transcripts": [ - "REALLY SHORT TEST STRING", - "THE TEST STRING HAS A LENGTH EQUAL TO THE MEDIAN OF THE CLEAN TEST 
TRANSCRIPT LENGTHS", - "THIS IS AN EXCEEDINGLY LONG TEST STRING BUT NOT REALLY AS THE LONGEST STRING HAS OVER FIVE HUNDRED CHARACTERS IN ITS TRANSCRIPT AND INCLUDES A LIST OF PEOPLE AND SPEAKS OF A SENATOR FROM NEW JERSEY" - ] - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": { - "kwargs": { - "apply_fit": false, - "apply_predict": true, - "channels_first": false, - "sample_rate": 16000, - "verbose": false - }, - "module": "art.defences.preprocessor", - "name": "Mp3CompressionPyTorch", - "type": "Preprocessor" - }, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_undefended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_undefended.json deleted file mode 100755 index 71b02f0dc..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_imperceptible_undefended.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE 
Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "decrease_factor_alpha": 0.5, - "decrease_factor_eps": 0.5, - "eps": 0.0075, - "global_max_length": 562480, - "increase_factor_alpha": 2.0, - "initial_rescale": 1.0, - "learning_rate_1": 0.0001, - "learning_rate_2": 1e-07, - "max_iter_1": 400, - "max_iter_2": 100, - "num_iter_decrease_alpha": 50 - }, - "module": "art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch", - "name": "ImperceptibleASRPyTorch", - "targeted": true, - "targeted_labels": { - "scheme": "matched length", - "transcripts": [ - "REALLY SHORT TEST STRING", - "THE TEST STRING HAS A LENGTH EQUAL TO THE MEDIAN OF THE CLEAN TEST TRANSCRIPT LENGTHS", - "THIS IS AN EXCEEDINGLY LONG TEST STRING BUT NOT REALLY AS THE LONGEST STRING HAS OVER FIVE HUNDRED CHARACTERS IN ITS TRANSCRIPT AND INCLUDES A LIST OF PEOPLE AND SPEAKS OF A SENATOR FROM NEW JERSEY" - ] - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, 
- "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_defended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_defended.json deleted file mode 100755 index c4d41fb71..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_defended.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "partial_attack": false, - "snr_db": 20, - "targeted": false - }, - "module": "armory.art_experimental.attacks.kenansville_dft", - "name": "KenansvilleDFT", - "use_label": false - }, - "dataset": { - "batch_size": 8, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": { - "kwargs": { - "apply_fit": false, - "apply_predict": true, - "channels_first": false, - "sample_rate": 16000, - "verbose": false - }, - "module": "art.defences.preprocessor", - "name": "Mp3Compression", - "type": "Preprocessor" - }, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - 
"output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_undefended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_undefended.json deleted file mode 100755 index 1a8e25bed..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_kenansville_undefended.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "partial_attack": false, - "snr_db": 20, - "targeted": false - }, - "module": "armory.art_experimental.attacks.kenansville_dft", - "name": "KenansvilleDFT", - "use_label": false - }, - "dataset": { - "batch_size": 8, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_defended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_defended.json 
deleted file mode 100755 index c54f8ef78..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_defended.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 1.5, - "eps_step": 0.05, - "max_iter": 100, - "norm": 2, - "num_random_init": 0, - "random_eps": false, - "targeted": false, - "verbose": false - }, - "module": "art.attacks.evasion", - "name": "ProjectedGradientDescent", - "targeted": false, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": { - "kwargs": { - "apply_fit": false, - "apply_predict": true, - "channels_first": false, - "sample_rate": 16000, - "verbose": false - }, - "module": "art.defences.preprocessor", - "name": "Mp3Compression", - "type": "Preprocessor" - }, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git 
a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_multipath_channel_undefended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_multipath_channel_undefended.json deleted file mode 100755 index ac814e83a..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_multipath_channel_undefended.json +++ /dev/null @@ -1,80 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "audio_channel": { - "attenuation": 0.5, - "delay": 300, - "pytorch": true - }, - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 1.5, - "eps_step": 0.05, - "max_iter": 100, - "norm": 2, - "num_random_init": 0, - "random_eps": false, - "targeted": false, - "verbose": false - }, - "module": "art.attacks.evasion", - "name": "ProjectedGradientDescent", - "targeted": false, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": 
null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_undefended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_undefended.json deleted file mode 100755 index 94a7bef1c..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_pgd_undefended.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 1.5, - "eps_step": 0.05, - "max_iter": 100, - "norm": 2, - "num_random_init": 0, - "random_eps": false, - "targeted": false, - "verbose": false - }, - "module": "art.attacks.evasion", - "name": "ProjectedGradientDescent", - "targeted": false, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git 
a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_targeted.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_targeted.json deleted file mode 100644 index 263adccac..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_targeted.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 10, - "eps_step": 0.5, - "max_iter": 10, - "norm": "snr", - "num_random_init": 0, - "targeted": true - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD_Numpy", - "targeted": true, - "targeted_labels": { - "kwargs": { - "import_from": "armory.attacks.librispeech_target_labels", - "transcripts": "matched_length" - }, - "module": "armory.utils.labels", - "name": "MatchedTranscriptLengthTargeter" - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "linf", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - 
"output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_undefended.json b/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_undefended.json deleted file mode 100755 index 9ed517ef0..000000000 --- a/scenario_configs/eval1-4/asr_librispeech/librispeech_asr_snr_undefended.json +++ /dev/null @@ -1,80 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 10, - "eps_step": 0.5, - "max_iter": 10, - "norm": "snr", - "num_random_init": 0, - "targeted": true - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD_Numpy", - "targeted": true, - "targeted_labels": { - "kwargs": { - "value": "TEST STRING" - }, - "module": "armory.utils.labels", - "name": "FixedStringTargeter" - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "linf", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": 
null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet.json b/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet.json deleted file mode 100644 index 71d688d97..000000000 --- a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "_description": "Librispeech_dev_clean raw audio classification, contributed by MITRE Corporation", - "adhoc": null, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 0.2, - "eps_step": 0.1, - "minimal": false, - "num_random_init": 0, - "targeted": false - }, - "module": "art.attacks.evasion", - "name": "FastGradientMethod", - "use_label": false - }, - "dataset": { - "batch_size": 1, - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech_dev_clean" - }, - "defense": null, - "metric": { - "means": true, - "perturbation": "linf", - "record_metric_per_sample": false, - "task": [ - "categorical_accuracy" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "fit_batch_size": 16, - "nb_epochs": 20000 - }, - "model_kwargs": { - "predict_mode": "all" - }, - "module": "armory.baseline_models.pytorch.sincnet", - "name": "get_art_model", - "weights_file": "sincnet_librispeech_v1.pth", - "wrapper_kwargs": { - "clip_values": [ - -1.0, - 1.0 - ] - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_classification", - "name": "AudioClassificationTask" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch", - "external_github_repo": "hkakitani/SincNet", - "gpus": "all", - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_snr_pgd.json b/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_snr_pgd.json deleted file mode 100644 
index 8ea65668d..000000000 --- a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_snr_pgd.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "_description": "Librispeech_dev_clean raw audio classification, contributed by MITRE Corporation", - "adhoc": null, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 10, - "eps_step": 0.5, - "max_iter": 10, - "norm": "snr", - "num_random_init": 0, - "targeted": false - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD", - "use_label": false - }, - "dataset": { - "batch_size": 1, - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech_dev_clean" - }, - "defense": null, - "metric": { - "means": true, - "perturbation": [ - "snr", - "snr_db" - ], - "record_metric_per_sample": true, - "task": [ - "categorical_accuracy" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "fit_batch_size": 16, - "nb_epochs": 20000 - }, - "model_kwargs": { - "predict_mode": "all" - }, - "module": "armory.baseline_models.pytorch.sincnet", - "name": "get_art_model", - "weights_file": "sincnet_librispeech_v1.pth", - "wrapper_kwargs": { - "clip_values": [ - -1.0, - 1.0 - ] - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_classification", - "name": "AudioClassificationTask" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch", - "external_github_repo": "hkakitani/SincNet", - "gpus": "all", - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_targeted.json b/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_targeted.json deleted file mode 100644 index 526353755..000000000 --- a/scenario_configs/eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_targeted.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "_description": "Librispeech_dev_clean raw audio classification, 
contributed by MITRE Corporation", - "adhoc": null, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 0.2, - "eps_step": 0.1, - "minimal": false, - "num_random_init": 0, - "targeted": true - }, - "module": "art.attacks.evasion", - "name": "FastGradientMethod", - "targeted_labels": { - "kwargs": { - "num_classes": 40 - }, - "module": "armory.utils.labels", - "name": "RoundRobinTargeter" - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech_dev_clean" - }, - "defense": null, - "metric": { - "means": true, - "perturbation": "linf", - "record_metric_per_sample": false, - "task": [ - "categorical_accuracy" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "fit_batch_size": 16, - "nb_epochs": 20000 - }, - "model_kwargs": { - "predict_mode": "all" - }, - "module": "armory.baseline_models.pytorch.sincnet", - "name": "get_art_model", - "weights_file": "sincnet_librispeech_v1.pth", - "wrapper_kwargs": { - "clip_values": [ - -1.0, - 1.0 - ] - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_classification", - "name": "AudioClassificationTask" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch", - "external_github_repo": "hkakitani/SincNet", - "gpus": "all", - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval5/asr_librispeech/defended_entailment.json b/scenario_configs/eval5/asr_librispeech/defended_entailment.json deleted file mode 100644 index 5727d7654..000000000 --- a/scenario_configs/eval5/asr_librispeech/defended_entailment.json +++ /dev/null @@ -1,97 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 20, - "eps_step": 0.05, - "max_iter": 500, - "norm": "snr", - 
"num_random_init": 0, - "targeted": true - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD_Numpy", - "targeted": true, - "targeted_labels": { - "kwargs": { - "dtype": "str", - "import_from": "armory.attacks.librispeech_target_labels", - "values": "entailment_100" - }, - "module": "armory.utils.labels", - "name": "ManualTargeter" - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": { - "kwargs": { - "apply_fit": false, - "apply_predict": true, - "channels_first": false, - "sample_rate": 16000, - "verbose": false - }, - "module": "art.defences.preprocessor", - "name": "Mp3Compression", - "type": "Preprocessor" - }, - "metric": { - "means": false, - "perturbation": "snr_db", - "record_metric_per_sample": true, - "task": [ - "entailment", - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": [ - "SeanNaren/deepspeech.pytorch@V3.0" - ], - "gpus": "all", - "local_repo_path": null, - "num_eval_batches": 100, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval5/asr_librispeech/defended_targeted_snr_pgd.json b/scenario_configs/eval5/asr_librispeech/defended_targeted_snr_pgd.json deleted file mode 100644 index c9ff3fdb2..000000000 --- 
a/scenario_configs/eval5/asr_librispeech/defended_targeted_snr_pgd.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 20, - "eps_step": 0.5, - "max_iter": 500, - "norm": "snr", - "num_random_init": 0, - "targeted": true - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD_Numpy", - "targeted": true, - "targeted_labels": { - "kwargs": { - "import_from": "armory.attacks.librispeech_target_labels", - "transcripts": "matched_length" - }, - "module": "armory.utils.labels", - "name": "MatchedTranscriptLengthTargeter" - }, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": { - "kwargs": { - "apply_fit": false, - "apply_predict": true, - "channels_first": false, - "sample_rate": 16000, - "verbose": false - }, - "module": "art.defences.preprocessor", - "name": "Mp3Compression", - "type": "Preprocessor" - }, - "metric": { - "means": false, - "perturbation": "linf", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - 
"local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git a/scenario_configs/eval5/asr_librispeech/untargeted_snr_pgd.json b/scenario_configs/eval5/asr_librispeech/untargeted_snr_pgd.json deleted file mode 100644 index 58a8c1af8..000000000 --- a/scenario_configs/eval5/asr_librispeech/untargeted_snr_pgd.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", - "adhoc": { - "skip_adversarial": false - }, - "attack": { - "knowledge": "white", - "kwargs": { - "batch_size": 1, - "eps": 20, - "eps_step": 0.5, - "max_iter": 500, - "norm": "snr", - "num_random_init": 0, - "targeted": false - }, - "module": "armory.art_experimental.attacks.snr_pgd", - "name": "SNR_PGD_Numpy", - "targeted": false, - "use_label": false - }, - "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" - }, - "defense": null, - "metric": { - "means": false, - "perturbation": "linf", - "record_metric_per_sample": true, - "task": [ - "word_error_rate" - ] - }, - "model": { - "fit": false, - "fit_kwargs": { - "nb_epochs": 20000 - }, - "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", - "name": "get_art_model", - "predict_kwargs": { - "transcription_output": true - }, - "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } - }, - "scenario": { - "kwargs": {}, - "module": "armory.scenarios.audio_asr", - "name": "AutomaticSpeechRecognition" - }, - "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", - "gpus": "all", - "local_repo_path": null, - "output_dir": null, - "output_filename": null, - "use_gpu": false - } -} diff --git 
a/scenario_configs/eval5/asr_librispeech/defended_untargeted_snr_pgd.json b/scenario_configs/eval6/asr_librispeech/hubert_defended_untargeted.json similarity index 70% rename from scenario_configs/eval5/asr_librispeech/defended_untargeted_snr_pgd.json rename to scenario_configs/eval6/asr_librispeech/hubert_defended_untargeted.json index 4c128b261..006318915 100644 --- a/scenario_configs/eval5/asr_librispeech/defended_untargeted_snr_pgd.json +++ b/scenario_configs/eval6/asr_librispeech/hubert_defended_untargeted.json @@ -1,5 +1,5 @@ { - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", + "_description": "Baseline HuBERT ASR on LibriSpeech", "adhoc": { "skip_adversarial": false }, @@ -20,12 +20,11 @@ "use_label": false }, "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" + "test": { + "batch_size": 1, + "name": "librispeech_dev_test", + "split": "test_clean" + } }, "defense": { "kwargs": { @@ -41,7 +40,7 @@ }, "metric": { "means": false, - "perturbation": "linf", + "perturbation": "snr_db", "record_metric_per_sample": true, "task": [ "word_error_rate" @@ -53,19 +52,13 @@ "nb_epochs": 20000 }, "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", + "module": "armory.baseline_models.pytorch.hubert_asr_large", "name": "get_art_model", "predict_kwargs": { "transcription_output": true }, "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } + "wrapper_kwargs": {} }, "scenario": { "kwargs": {}, @@ -73,8 +66,8 @@ "name": "AutomaticSpeechRecognition" }, "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", + "docker_image": "twosixarmory/pytorch", + "external_github_repo": null, "gpus": "all", "local_repo_path": null, "output_dir": null, diff 
--git a/scenario_configs/eval5/asr_librispeech/entailment.json b/scenario_configs/eval6/asr_librispeech/hubert_entailment.json similarity index 70% rename from scenario_configs/eval5/asr_librispeech/entailment.json rename to scenario_configs/eval6/asr_librispeech/hubert_entailment.json index 21f5ff3e1..1a3388db5 100644 --- a/scenario_configs/eval5/asr_librispeech/entailment.json +++ b/scenario_configs/eval6/asr_librispeech/hubert_entailment.json @@ -1,5 +1,5 @@ { - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", + "_description": "Baseline HuBERT ASR on LibriSpeech", "adhoc": { "skip_adversarial": false }, @@ -29,12 +29,11 @@ "use_label": false }, "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" + "test": { + "batch_size": 1, + "name": "librispeech_dev_test", + "split": "test_clean" + } }, "defense": null, "metric": { @@ -52,19 +51,13 @@ "nb_epochs": 20000 }, "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", + "module": "armory.baseline_models.pytorch.hubert_asr_large", "name": "get_art_model", "predict_kwargs": { "transcription_output": true }, "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } + "wrapper_kwargs": {} }, "scenario": { "kwargs": {}, @@ -72,13 +65,10 @@ "name": "AutomaticSpeechRecognition" }, "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": [ - "SeanNaren/deepspeech.pytorch@V3.0" - ], + "docker_image": "twosixarmory/pytorch", + "external_github_repo": null, "gpus": "all", "local_repo_path": null, - "num_eval_batches": 100, "output_dir": null, "output_filename": null, "use_gpu": false diff --git a/scenario_configs/eval5/asr_librispeech/targeted_snr_pgd.json b/scenario_configs/eval6/asr_librispeech/hubert_targeted_snr_pgd.json similarity 
index 69% rename from scenario_configs/eval5/asr_librispeech/targeted_snr_pgd.json rename to scenario_configs/eval6/asr_librispeech/hubert_targeted_snr_pgd.json index f650a46eb..5f469cca6 100644 --- a/scenario_configs/eval5/asr_librispeech/targeted_snr_pgd.json +++ b/scenario_configs/eval6/asr_librispeech/hubert_targeted_snr_pgd.json @@ -1,5 +1,5 @@ { - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", + "_description": "Baseline HuBERT ASR on LibriSpeech", "adhoc": { "skip_adversarial": false }, @@ -28,17 +28,16 @@ "use_label": false }, "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" + "test": { + "batch_size": 1, + "name": "librispeech_dev_test", + "split": "test_clean" + } }, "defense": null, "metric": { "means": false, - "perturbation": "linf", + "perturbation": "snr_db", "record_metric_per_sample": true, "task": [ "word_error_rate" @@ -50,19 +49,13 @@ "nb_epochs": 20000 }, "model_kwargs": {}, - "module": "armory.baseline_models.pytorch.deep_speech", + "module": "armory.baseline_models.pytorch.hubert_asr_large", "name": "get_art_model", "predict_kwargs": { "transcription_output": true }, "weights_file": null, - "wrapper_kwargs": { - "clip_values": [ - -1, - 1 - ], - "pretrained_model": "librispeech" - } + "wrapper_kwargs": {} }, "scenario": { "kwargs": {}, @@ -70,8 +63,8 @@ "name": "AutomaticSpeechRecognition" }, "sysconfig": { - "docker_image": "twosixarmory/pytorch-deepspeech", - "external_github_repo": "SeanNaren/deepspeech.pytorch@V3.0", + "docker_image": "twosixarmory/pytorch", + "external_github_repo": null, "gpus": "all", "local_repo_path": null, "output_dir": null, diff --git a/scenario_configs/eval6/asr_librispeech/hubert_untargeted_snr_pgd.json b/scenario_configs/eval6/asr_librispeech/hubert_untargeted_snr_pgd.json index 25b1b5bc0..3ce122237 100644 --- 
a/scenario_configs/eval6/asr_librispeech/hubert_untargeted_snr_pgd.json +++ b/scenario_configs/eval6/asr_librispeech/hubert_untargeted_snr_pgd.json @@ -1,5 +1,5 @@ { - "_description": "Baseline DeepSpeech ASR on LibriSpeech, contributed by MITRE Corporation", + "_description": "Baseline HuBERT ASR on LibriSpeech", "adhoc": { "skip_adversarial": false }, @@ -20,17 +20,16 @@ "use_label": false }, "dataset": { - "batch_size": 1, - "eval_split": "test_clean", - "framework": "numpy", - "module": "armory.data.datasets", - "name": "librispeech", - "train_split": "train_clean100" + "test": { + "batch_size": 1, + "name": "librispeech_dev_test", + "split": "test_clean" + } }, "defense": null, "metric": { "means": false, - "perturbation": "linf", + "perturbation": "snr_db", "record_metric_per_sample": true, "task": [ "word_error_rate" diff --git a/scenario_configs/speaker_id_librispeech.json b/scenario_configs/speaker_id_librispeech.json deleted file mode 120000 index c9d0b713e..000000000 --- a/scenario_configs/speaker_id_librispeech.json +++ /dev/null @@ -1 +0,0 @@ -eval1-4/speaker_id_librispeech/librispeech_baseline_sincnet_snr_pgd.json \ No newline at end of file