# .github/workflows/gh-task-runner-mps.yml

name: GH Task Runner (MPS)

# Manually triggered workflow: every run is configured through the inputs
# below, which are entered in the "Run workflow" form (or sent via the API).
on:
  workflow_dispatch:
    inputs:
      # Predefined task selection. `type: choice` is required for `options`
      # to be rendered as a dropdown; without it the input is a plain string
      # field and the list below is not offered to the user.
      run_task:
        description: 'Task to run'
        required: true
        type: choice
        default: 'arc'
        options:
          - anli
          - arc
          - arithmetic
          - asdiv
          - babi
          - bbh
          - belebele
          - benchmarks
          - bigbench
          - blimp
          - ceval
          - cmmlu
          - code_x_glue
          - coqa
          - crows_pairs
          - csatqa
          - drop
          - fld
          - glue
          - gsm8k
          - headqa
          - hellaswag
          - hendrycks_ethics
          - ifeval
          - kmmlu
          - kobest
          - lambada
          - lambada_cloze
          - lambada_multilingual
          - logiqa
          - logiqa2
          - mathqa
          - mc_taco
          - medmcqa
          - medqa
          - mgsm
          - minerva_math
          - model_written_evals
          - mutual
          - nq_open
          - okapi
          - openbookqa
          - paws-x
          - piqa
          - pile
          - polemo2
          - prost
          - pubmedqa
          - qa4mre
          - qasper
          - race
          - realtoxicityprompts
          - sciq
          - scrolls
          - siqa
          - squadv2
          - storycloze
          - super_glue
          - swag
          - toxigen
          - translation
          - triviaqa
          - truthfulqa
          - unscramble
          - webqs
          - wikitext
          - winogrande
          - wmt2016
          - wsc273
          - xcopa
          - xnli
          - xstorycloze
          - xwinograd
      # Free-form task name; when non-empty it takes precedence over
      # `run_task` (use it for tasks or patterns not in the list above).
      custom_task:
        description: 'Custom Task to run (overwrites previous)'
        required: false
        default: ''
      # Few-shot count. The run step only forwards `--num_fewshot` when this
      # is >= 0, so the negative default means "do not pass the flag".
      # Quoted because workflow_dispatch inputs are delivered as strings.
      num_fewshot:
        description: 'Number of Fewshot Examples'
        required: true
        default: '-1'
      model_hf_repo:
        description: 'Model Hugging Face Repository'
        required: true
        default: 'RWKV/rwkv-5-world-1b5'
      # Appended after `pretrained=<model_hf_repo>,` in `--model_args`.
      model_args:
        description: 'Model Arguments'
        required: false
        default: 'dtype="float32",trust_remote_code=True'
      batch_size:
        description: 'Batch Size'
        required: true
        default: 'auto'
      # Disabled inputs, kept for reference (this workflow always runs on MPS).
      # backend:
      #   description: 'Backend to use'
      #   required: true
      #   default: 'mps'
      #   options:
      #   - nvidia-gpu
      #   - intel-gpu
      #   - amd-gpu
      #   - any-gpu
      # gpu_vram:
      #   description: 'Minimum GPU VRAM (ignored for MPS)'
      #   required: true
      #   default: '24'
      #   options:
      #   - 16
      #   - 24
      #   - 80

env:
  # Final task to run: the free-form custom task wins over the dropdown
  # selection whenever it is non-empty.
  RUN_TASK: ${{ github.event.inputs.custom_task || github.event.inputs.run_task }}

  # HF repo to sync to
  HF_REPO_SYNC: rwkv-x-dev/lm-eval-output

  # Model HF repo
  MODEL_HF_REPO: ${{ github.event.inputs.model_hf_repo }}

  # Raw model arguments. Exposed as an environment variable so the shell
  # steps never have workflow inputs pasted directly into their script text.
  MODEL_ARGS: ${{ github.event.inputs.model_args }}

  # Secrets
  HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

  # CLEAN_TASK_NAME (RUN_TASK with every `*` replaced by `_`) is derived in
  # shell inside the "Upload outputs to HF" step. It cannot be declared here:
  # workflow expressions have no string-replace function, and the `env`
  # context is not readable from the workflow-level `env` mapping.

jobs:
  gh-task-runner-mps:
    # Name of the job. The `env` context is not available for a job name, so
    # the task is resolved from the inputs with the same expression as RUN_TASK.
    name: "[MPS-${{ github.event.inputs.custom_task || github.event.inputs.run_task }}] ${{ github.event.inputs.model_hf_repo }} - ${{ github.event.inputs.model_args }}"

    # Cap the job at 23 hours (1380 minutes), just below the 24-hour limit
    # this workflow was written against.
    # NOTE(review): GitHub-hosted runners are documented with a shorter
    # per-job execution limit - confirm this value is reachable on macos-14.
    timeout-minutes: 1380

    # Currently, MacOS ARM is only on v14.
    # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners
    runs-on: macos-14

    # Actual task setup, and run steps
    steps:
      - name: Clone repository
        id: checkout
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies / setup project
        run: |
          pip install -e .
          mkdir -p ./output

      - name: Run Task
        # An explicit `shell: bash` runs the script with `-eo pipefail`, so a
        # failing lm_eval is no longer masked by the exit status of `tee`.
        shell: bash
        env:
          NUM_FEWSHOT: ${{ github.event.inputs.num_fewshot }}
          BATCH_SIZE: ${{ github.event.inputs.batch_size }}
        run: |
          # Drop quote characters from the model arguments, so the default
          # dtype="float32" still reaches lm_eval as dtype=float32 (as it did
          # when the value was pasted unquoted into the script).
          model_args=${MODEL_ARGS//\"/}
          model_args=${model_args//\'/}

          # Build the --model_args value; skip the separator when there are
          # no extra arguments.
          pretrained="pretrained=${MODEL_HF_REPO}"
          if [ -n "$model_args" ]; then
            pretrained="${pretrained},${model_args}"
          fi

          # Arguments shared by every run. Each value is quoted so that task
          # patterns containing `*` are not glob-expanded by the shell.
          lm_eval_args=(
            --model hf
            --model_args "$pretrained"
            --tasks "$RUN_TASK"
            --batch_size "$BATCH_SIZE"
            --device mps
          )

          # Only forward the few-shot setting when it is larger or equal to 0
          if [ "$NUM_FEWSHOT" -ge 0 ]; then
            lm_eval_args+=(--num_fewshot "$NUM_FEWSHOT")
          fi

          lm_eval "${lm_eval_args[@]}" \
            --log_samples --output_path ./output 2>&1 | tee -a ./output/run-log.txt

      - name: Upload outputs to HF
        if: always()
        shell: bash
        run: |
          # Clean task name: replace every `*` in the task name with `_`.
          # Exported in case the upload script reads it from the environment
          # (NOTE(review): the script is not visible here - confirm).
          CLEAN_TASK_NAME=${RUN_TASK//\*/_}
          export CLEAN_TASK_NAME

          # Same quote stripping as in the "Run Task" step, so the upload path
          # matches what the shell produced before (e.g. dtype=float32,...).
          model_args=${MODEL_ARGS//\"/}
          model_args=${model_args//\'/}

          ./gh-task-runner/hf-upload-runner.sh "$HF_REPO_SYNC" "${MODEL_HF_REPO}/${CLEAN_TASK_NAME}/MPS/${model_args}" "./output"

      # Note that this is meant to be a contingency measure, in case the HF upload failed
      - name: Save output models
        uses: actions/upload-artifact@v4
        # if: failure()
        if: always()
        with:
          name: output-files
          path: |
            output/*
          # retention-days: 90