title: "Categorization of emotions in Reddit posts (Text Classification)"
description: |
This project uses spaCy to train a text classifier on the [GoEmotions dataset](https://github.com/google-research/google-research/tree/master/goemotions) with options for a pipeline with and without transformer weights. To use the BERT-based config, change the `config` variable in the `project.yml`.
> The goal of this project is to show how to train a spaCy classifier based on a csv file, not to showcase a model that's ready for production. The GoEmotions dataset has known flaws described [here](https://github.com/google-research/google-research/tree/master/goemotions#disclaimer) as well as label errors resulting from [annotator disagreement](https://www.youtube.com/watch?v=khZ5-AN-n2Y).

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "textcat_goemotions"
  version: "0.0.1"
  # Choose your GPU here (-1 disables the GPU and trains on CPU)
  gpu_id: -1
  # Change this to "bert" to use the transformer-based model
  config: "cnn"
# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "training", "configs", "metrics", "packages", "scripts", "corpus"]

# Assets that should be downloaded or available in the directory.
assets:
  - dest: "assets/categories.txt"
    url: "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt"
    description: "The categories to train"
  - dest: "assets/train.tsv"
    url: "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv"
    description: "The training data"
  - dest: "assets/dev.tsv"
    url: "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv"
    description: "The development data"
  - dest: "assets/test.tsv"
    url: "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv"
    description: "The test data"
  # Uncomment this asset if you want to download the vectors.
  #- dest: "assets/vectors.zip"
  #  url: "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
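# Download all of the assets above with:
#   python -m spacy project assets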

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - preprocess
    - train
    - evaluate
    - package
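# Run the full workflow in order with:
#   python -m spacy project run all
# Individual steps can also be run on their own, e.g.:
#   python -m spacy project run preprocess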
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: init-vectors
    help: "Download fastText vectors and convert them to a spaCy vectors model"
    script:
      - "python -m spacy init vectors en assets/vectors.zip assets/en_fasttext_vectors"
    deps:
      - "assets/vectors.zip"
    outputs_no_cache:
      - "assets/en_fasttext_vectors"
  - name: preprocess
    help: "Convert the corpus to spaCy's format"
    script:
      - "python scripts/convert_corpus.py"
    deps:
      - "assets/train.tsv"
      - "assets/dev.tsv"
      - "assets/test.tsv"
      - "assets/categories.txt"
    outputs_no_cache:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
      - "corpus/test.spacy"
  - name: train
    help: "Train a spaCy pipeline using the specified corpus and config"
    script:
      - "python -m spacy train ./configs/${vars.config}.cfg -o training/${vars.config} --gpu-id ${vars.gpu_id}"
    deps:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
      - "configs/${vars.config}.cfg"
    outputs:
      - "training/${vars.config}/model-best"
  - name: evaluate
    help: "Evaluate on the test data and save the metrics"
    script:
      - "python -m spacy evaluate ./training/${vars.config}/model-best ./corpus/test.spacy --output ./metrics/${vars.config}.json"
    deps:
      - "training/${vars.config}/model-best"
      - "corpus/test.spacy"
    outputs:
      - "metrics/${vars.config}.json"
  - name: package
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package ./training/${vars.config}/model-best packages/ --name ${vars.name} --force --version ${vars.version} --build wheel --meta-path meta.json"
    deps:
      - "training/${vars.config}/model-best"
    outputs_no_cache:
      - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}-py3-none-any.whl"
  - name: visualize
    help: "Visualize the model's output interactively using Streamlit"
    script:
      - "streamlit run scripts/visualize_model.py training/${vars.config}/model-best \"I’m sorry to hear that friend :(\""
    deps:
      - "scripts/visualize_model.py"
      - "training/${vars.config}/model-best"
  - name: assemble
    help: "Combine the model with a pretrained pipeline"
    script:
      - "python -m spacy assemble configs/cnn_with_pretrained.cfg cnn_with_pretrained"
    deps:
      - "training/cnn/model-best"