diff --git a/.gitignore b/.gitignore index 6affb0c..c04f256 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,5 @@ dmypy.json .pyre/ # End of https://www.gitignore.io/api/python + +src/caduceus/_version.py \ No newline at end of file diff --git a/configs/model/caduceus.yaml b/configs/model/caduceus.yaml index cd34ca9..d82177a 100644 --- a/configs/model/caduceus.yaml +++ b/configs/model/caduceus.yaml @@ -1,7 +1,7 @@ # Use open-source version of Mamba _name_: caduceus_lm config: - _target_: caduceus.configuration_caduceus.CaduceusConfig + _target_: caduceus.huggingface.configuration_caduceus.CaduceusConfig # From original MambaConfig d_model: 128 n_layer: 2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ca5857f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,84 @@ +[build-system] +requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +write_to = "src/caduceus/_version.py" +version_scheme = "post-release" +fallback_version = "0.0.0" + +[tool.black] +line-length = 88 +include = '\.pyi?$' +exclude = ''' +/( + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +known_first_party = ["caduceus"] +line_length = 88 + +[tool.pytest.ini_options] +addopts = "-ra" + +[tool.coverage.run] +branch = true +source = ["src/caduceus"] + +[tool.coverage.paths] +source = ["src/caduceus", "*/site-packages"] + +[tool.coverage.report] +show_missing = true +exclude_lines = [ + "pragma: no cover", + "if __name__ == .__main__.:", + "if typing.TYPE_CHECKING:", + "if TYPE_CHECKING:", + "raise NotImplementedError", + "raise AssertionError", + "@overload", +] + +[tool.mypy] +# Error output +show_column_numbers = true +show_error_codes = true +show_error_context = true +show_traceback = true +pretty = true +check_untyped_defs = false +# Warnings +warn_no_return = true +warn_redundant_casts = 
true +warn_unreachable = true +files = ["src/caduceus", "tests"] + +[tool.pylint.format] +max-line-length = 88 + +[tool.pylint.message_control] +enable = ["c-extension-no-member", "no-else-return"] + +[tool.pylint.variables] +dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_" +ignored-argument-names = "_.*|^ignored_|^unused_|args|kwargs" + +[tool.codespell] +ignore-words-list = " " + +[tool.bandit] +exclude_dirs = ["tests"] +skips = ["B101"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..0e3b6ce --- /dev/null +++ b/setup.cfg @@ -0,0 +1,54 @@ +[metadata] +description = Bi-Directional Equivariant Long-Range DNA Sequence Modeling +name = caduceus +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/kuleshov-group/caduceus +platforms=any +author = Yair Schiff +classifiers = + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + +[options] +python_requires = >=3.8 +zip_safe = False +package_dir= + =src +packages = find: +install_requires= + biopython + causal-conv1d + datasets + einops + genomic-benchmarks + huggingface-hub + hydra-core + mamba-ssm + omegaconf + lightning[extra] + rich + scikit-learn + timm + torchmetrics + transformers + triton + pyfaidx + pandas + +[options.extras_require] +tests = + pytest +dev = + %(tests)s + pre-commit + +[options.packages.find] +exclude = + tests + tests.* +where=src diff --git a/src/caduceus/__init__.py b/src/caduceus/__init__.py new file mode 100644 index 0000000..1e6cf1a --- /dev/null +++ b/src/caduceus/__init__.py @@ -0,0 +1,13 @@ +"""Caduceus""" + +import warnings + +try: + from caduceus._version import __version__ +except ImportError: + __version__ = "not-installed" + warnings.warn( + "You are running a non-installed version
of caduceus. " + "If you are running this from a git repo, please run " + "`pip install -e .` to install the package." + ) \ No newline at end of file diff --git a/src/__init__.py b/src/caduceus/callbacks/__init__.py similarity index 100% rename from src/__init__.py rename to src/caduceus/callbacks/__init__.py diff --git a/src/callbacks/params.py b/src/caduceus/callbacks/params.py similarity index 100% rename from src/callbacks/params.py rename to src/caduceus/callbacks/params.py diff --git a/src/callbacks/timer.py b/src/caduceus/callbacks/timer.py similarity index 100% rename from src/callbacks/timer.py rename to src/caduceus/callbacks/timer.py diff --git a/src/callbacks/validation.py b/src/caduceus/callbacks/validation.py similarity index 100% rename from src/callbacks/validation.py rename to src/caduceus/callbacks/validation.py diff --git a/src/dataloaders/__init__.py b/src/caduceus/dataloaders/__init__.py similarity index 100% rename from src/dataloaders/__init__.py rename to src/caduceus/dataloaders/__init__.py diff --git a/src/dataloaders/base.py b/src/caduceus/dataloaders/base.py similarity index 98% rename from src/dataloaders/base.py rename to src/caduceus/dataloaders/base.py index f50f260..40a22f1 100644 --- a/src/dataloaders/base.py +++ b/src/caduceus/dataloaders/base.py @@ -11,7 +11,7 @@ # Default data path is environment variable or /data if (default_data_path := os.getenv("DATA_PATH")) is None: - default_data_path = Path(__file__).parent.parent.parent.absolute() + default_data_path = Path(__file__).parent.parent.parent.parent.absolute() default_data_path = default_data_path / "data" else: default_data_path = Path(default_data_path).absolute() diff --git a/src/models/__init__.py b/src/caduceus/dataloaders/datasets/__init__.py similarity index 100% rename from src/models/__init__.py rename to src/caduceus/dataloaders/datasets/__init__.py diff --git a/src/dataloaders/datasets/genomic_bench_dataset.py
b/src/caduceus/dataloaders/datasets/genomic_bench_dataset.py similarity index 98% rename from src/dataloaders/datasets/genomic_bench_dataset.py rename to src/caduceus/dataloaders/datasets/genomic_bench_dataset.py index 14a104c..aa82b50 100644 --- a/src/dataloaders/datasets/genomic_bench_dataset.py +++ b/src/caduceus/dataloaders/datasets/genomic_bench_dataset.py @@ -9,7 +9,7 @@ from genomic_benchmarks.data_check import is_downloaded from genomic_benchmarks.loc2seq import download_dataset -from src.dataloaders.utils.rc import coin_flip, string_reverse_complement +from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement class GenomicBenchmarkDataset(torch.utils.data.Dataset): diff --git a/src/dataloaders/datasets/hg38_char_tokenizer.py b/src/caduceus/dataloaders/datasets/hg38_char_tokenizer.py similarity index 100% rename from src/dataloaders/datasets/hg38_char_tokenizer.py rename to src/caduceus/dataloaders/datasets/hg38_char_tokenizer.py diff --git a/src/dataloaders/datasets/hg38_dataset.py b/src/caduceus/dataloaders/datasets/hg38_dataset.py similarity index 98% rename from src/dataloaders/datasets/hg38_dataset.py rename to src/caduceus/dataloaders/datasets/hg38_dataset.py index b140450..e60fbe6 100644 --- a/src/dataloaders/datasets/hg38_dataset.py +++ b/src/caduceus/dataloaders/datasets/hg38_dataset.py @@ -9,8 +9,8 @@ import torch from pyfaidx import Fasta -from src.dataloaders.utils.mlm import mlm_getitem -from src.dataloaders.utils.rc import coin_flip, string_reverse_complement +from caduceus.dataloaders.utils.mlm import mlm_getitem +from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement MAX_ALLOWED_LENGTH = 2 ** 20 diff --git a/src/dataloaders/datasets/nucleotide_transformer_dataset.py b/src/caduceus/dataloaders/datasets/nucleotide_transformer_dataset.py similarity index 97% rename from src/dataloaders/datasets/nucleotide_transformer_dataset.py rename to 
src/caduceus/dataloaders/datasets/nucleotide_transformer_dataset.py index ca13f38..4e70478 100644 --- a/src/dataloaders/datasets/nucleotide_transformer_dataset.py +++ b/src/caduceus/dataloaders/datasets/nucleotide_transformer_dataset.py @@ -6,7 +6,7 @@ import torch from datasets import load_dataset -from src.dataloaders.utils.rc import coin_flip, string_reverse_complement +from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement class NucleotideTransformerDataset(torch.utils.data.Dataset): diff --git a/src/dataloaders/fault_tolerant_sampler.py b/src/caduceus/dataloaders/fault_tolerant_sampler.py similarity index 100% rename from src/dataloaders/fault_tolerant_sampler.py rename to src/caduceus/dataloaders/fault_tolerant_sampler.py diff --git a/src/dataloaders/genomics.py b/src/caduceus/dataloaders/genomics.py similarity index 95% rename from src/dataloaders/genomics.py rename to src/caduceus/dataloaders/genomics.py index 1afc2da..1fe72ee 100644 --- a/src/dataloaders/genomics.py +++ b/src/caduceus/dataloaders/genomics.py @@ -13,17 +13,17 @@ from datasets import Dataset from torch.utils.data.dataloader import DataLoader -from caduceus.tokenization_caduceus import CaduceusTokenizer -import src.utils.train -from src.dataloaders.base import SequenceDataset, default_data_path -from src.dataloaders.datasets.genomic_bench_dataset import GenomicBenchmarkDataset -from src.dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer -from src.dataloaders.datasets.hg38_dataset import HG38Dataset -from src.dataloaders.datasets.nucleotide_transformer_dataset import NucleotideTransformerDataset -from src.dataloaders.fault_tolerant_sampler import FaultTolerantDistributedSampler -from src.dataloaders.fault_tolerant_sampler import RandomFaultTolerantSampler - -logger = src.utils.train.get_logger(__name__) +from caduceus.huggingface.tokenization_caduceus import CaduceusTokenizer +import caduceus.utils.train +from caduceus.dataloaders.base import 
SequenceDataset, default_data_path +from caduceus.dataloaders.datasets.genomic_bench_dataset import GenomicBenchmarkDataset +from caduceus.dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer +from caduceus.dataloaders.datasets.hg38_dataset import HG38Dataset +from caduceus.dataloaders.datasets.nucleotide_transformer_dataset import NucleotideTransformerDataset +from caduceus.dataloaders.fault_tolerant_sampler import FaultTolerantDistributedSampler +from caduceus.dataloaders.fault_tolerant_sampler import RandomFaultTolerantSampler + +logger = caduceus.utils.train.get_logger(__name__) class HG38(SequenceDataset): diff --git a/src/models/baseline/__init__.py b/src/caduceus/dataloaders/utils/__init__.py similarity index 100% rename from src/models/baseline/__init__.py rename to src/caduceus/dataloaders/utils/__init__.py diff --git a/src/dataloaders/utils/mlm.py b/src/caduceus/dataloaders/utils/mlm.py similarity index 100% rename from src/dataloaders/utils/mlm.py rename to src/caduceus/dataloaders/utils/mlm.py diff --git a/src/dataloaders/utils/rc.py b/src/caduceus/dataloaders/utils/rc.py similarity index 100% rename from src/dataloaders/utils/rc.py rename to src/caduceus/dataloaders/utils/rc.py diff --git a/caduceus/__init__.py b/src/caduceus/huggingface/__init__.py similarity index 100% rename from caduceus/__init__.py rename to src/caduceus/huggingface/__init__.py diff --git a/caduceus/configuration_caduceus.py b/src/caduceus/huggingface/configuration_caduceus.py similarity index 100% rename from caduceus/configuration_caduceus.py rename to src/caduceus/huggingface/configuration_caduceus.py diff --git a/caduceus/modeling_caduceus.py b/src/caduceus/huggingface/modeling_caduceus.py similarity index 100% rename from caduceus/modeling_caduceus.py rename to src/caduceus/huggingface/modeling_caduceus.py diff --git a/caduceus/modeling_rcps.py b/src/caduceus/huggingface/modeling_rcps.py similarity index 100% rename from caduceus/modeling_rcps.py rename to 
src/caduceus/huggingface/modeling_rcps.py diff --git a/src/models/sequence/__init__.py b/src/caduceus/huggingface/tests/__init__.py similarity index 100% rename from src/models/sequence/__init__.py rename to src/caduceus/huggingface/tests/__init__.py diff --git a/caduceus/tests/test_rcps.py b/src/caduceus/huggingface/tests/test_rcps.py similarity index 99% rename from caduceus/tests/test_rcps.py rename to src/caduceus/huggingface/tests/test_rcps.py index 6b0089f..8ecc19b 100644 --- a/caduceus/tests/test_rcps.py +++ b/src/caduceus/huggingface/tests/test_rcps.py @@ -12,11 +12,11 @@ except ImportError: RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None -from caduceus.modeling_rcps import ( +from caduceus.huggingface.modeling_rcps import ( RCPSEmbedding, RCPSAddNormWrapper, RCPSLMHead, RCPSWrapper ) -from caduceus.modeling_caduceus import CaduceusConfig, CaduceusMixerModel, CaduceusForMaskedLM, create_block +from caduceus.huggingface.modeling_caduceus import CaduceusConfig, CaduceusMixerModel, CaduceusForMaskedLM, create_block @pytest.mark.parametrize("batch_size", [4]) diff --git a/caduceus/tokenization_caduceus.py b/src/caduceus/huggingface/tokenization_caduceus.py similarity index 100% rename from caduceus/tokenization_caduceus.py rename to src/caduceus/huggingface/tokenization_caduceus.py diff --git a/src/caduceus/models/__init__.py b/src/caduceus/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/caduceus/models/baseline/__init__.py b/src/caduceus/models/baseline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/baseline/genomics_benchmark_cnn.py b/src/caduceus/models/baseline/genomics_benchmark_cnn.py similarity index 100% rename from src/models/baseline/genomics_benchmark_cnn.py rename to src/caduceus/models/baseline/genomics_benchmark_cnn.py diff --git a/src/models/nn/__init__.py b/src/caduceus/models/nn/__init__.py similarity index 100% rename from src/models/nn/__init__.py rename to 
src/caduceus/models/nn/__init__.py diff --git a/src/models/nn/activation.py b/src/caduceus/models/nn/activation.py similarity index 100% rename from src/models/nn/activation.py rename to src/caduceus/models/nn/activation.py diff --git a/src/models/nn/adaptive_softmax.py b/src/caduceus/models/nn/adaptive_softmax.py similarity index 100% rename from src/models/nn/adaptive_softmax.py rename to src/caduceus/models/nn/adaptive_softmax.py diff --git a/src/models/nn/utils.py b/src/caduceus/models/nn/utils.py similarity index 100% rename from src/models/nn/utils.py rename to src/caduceus/models/nn/utils.py diff --git a/src/caduceus/models/sequence/__init__.py b/src/caduceus/models/sequence/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/sequence/dna_embedding.py b/src/caduceus/models/sequence/dna_embedding.py similarity index 97% rename from src/models/sequence/dna_embedding.py rename to src/caduceus/models/sequence/dna_embedding.py index bc0f249..1ef23e1 100644 --- a/src/models/sequence/dna_embedding.py +++ b/src/caduceus/models/sequence/dna_embedding.py @@ -18,10 +18,10 @@ ColumnParallelLinear = None -from caduceus.configuration_caduceus import CaduceusConfig -from caduceus.modeling_caduceus import Caduceus -from src.models.sequence.long_conv_lm import LMBackbone -from src.models.sequence.long_conv_lm import _init_weights +from caduceus.huggingface.configuration_caduceus import CaduceusConfig +from caduceus.huggingface.modeling_caduceus import Caduceus +from caduceus.models.sequence.long_conv_lm import LMBackbone +from caduceus.models.sequence.long_conv_lm import _init_weights class DNAEmbeddingModel(nn.Module, GenerationMixin): diff --git a/src/models/sequence/hyena.py b/src/caduceus/models/sequence/hyena.py similarity index 98% rename from src/models/sequence/hyena.py rename to src/caduceus/models/sequence/hyena.py index b902f63..1d6d392 100644 --- a/src/models/sequence/hyena.py +++ b/src/caduceus/models/sequence/hyena.py @@ -6,7 +6,7 @@ 
from einops import rearrange try: - from src.ops.fftconv import fftconv_ref, fftconv_func, fftconv_heads_ref + from caduceus.ops.fftconv import fftconv_ref, fftconv_func, fftconv_heads_ref except ImportError: fftconv_func = None @@ -16,10 +16,10 @@ except ImportError: FusedDense = None -import src.utils.registry as registry -from src.utils.train import OptimModule -from src.utils.config import instantiate, auto_assign_attrs -from src.models.nn import Activation +import caduceus.utils.registry as registry +from caduceus.utils.train import OptimModule +from caduceus.utils.config import instantiate, auto_assign_attrs +from caduceus.models.nn import Activation class FFTConvFuncv2(torch.autograd.Function): diff --git a/src/models/sequence/long_conv_lm.py b/src/caduceus/models/sequence/long_conv_lm.py similarity index 99% rename from src/models/sequence/long_conv_lm.py rename to src/caduceus/models/sequence/long_conv_lm.py index 14ba332..cecd98f 100644 --- a/src/models/sequence/long_conv_lm.py +++ b/src/caduceus/models/sequence/long_conv_lm.py @@ -26,8 +26,8 @@ except ImportError: dropout_add_layer_norm = None -from src.utils import instantiate -import src.utils.registry as registry +from caduceus.utils import instantiate +import caduceus.utils.registry as registry class CheckpointedModule(torch.nn.Module): diff --git a/src/caduceus/ops/__init__.py b/src/caduceus/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ops/fftconv.py b/src/caduceus/ops/fftconv.py similarity index 100% rename from src/ops/fftconv.py rename to src/caduceus/ops/fftconv.py diff --git a/src/caduceus/tasks/__init__.py b/src/caduceus/tasks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tasks/decoders.py b/src/caduceus/tasks/decoders.py similarity index 97% rename from src/tasks/decoders.py rename to src/caduceus/tasks/decoders.py index 92e6ae0..19fc6ee 100644 --- a/src/tasks/decoders.py +++ b/src/caduceus/tasks/decoders.py @@ -6,11 +6,11 @@ import 
torch.nn as nn import torch.nn.functional as F -import src.models.nn.utils as U -import src.utils as utils -import src.utils.train +import caduceus.models.nn.utils as U +import caduceus.utils as utils +import caduceus.utils.train -log = src.utils.train.get_logger(__name__) +log = caduceus.utils.train.get_logger(__name__) class Decoder(nn.Module): diff --git a/src/tasks/encoders.py b/src/caduceus/tasks/encoders.py similarity index 97% rename from src/tasks/encoders.py rename to src/caduceus/tasks/encoders.py index a1e21a0..0adc994 100644 --- a/src/tasks/encoders.py +++ b/src/caduceus/tasks/encoders.py @@ -1,7 +1,7 @@ from torch import nn -import src.models.nn.utils as U -import src.utils as utils +import caduceus.models.nn.utils as U +import caduceus.utils as utils class Encoder(nn.Module): diff --git a/src/tasks/metrics.py b/src/caduceus/tasks/metrics.py similarity index 99% rename from src/tasks/metrics.py rename to src/caduceus/tasks/metrics.py index 7cb9550..a8a2f65 100644 --- a/src/tasks/metrics.py +++ b/src/caduceus/tasks/metrics.py @@ -3,11 +3,11 @@ import torch import torch.nn.functional as F -import torchmetrics.functional as tm_f +from caduceus.tasks import torchmetrics as tm_f from sklearn.metrics import f1_score, roc_auc_score, matthews_corrcoef from torchmetrics.classification import MulticlassRecall, MulticlassPrecision -from torchmetrics import Metric +from caduceus.tasks.torchmetrics import Metric class CorrectAggregatedMetric(Metric): diff --git a/src/tasks/tasks.py b/src/caduceus/tasks/tasks.py similarity index 97% rename from src/tasks/tasks.py rename to src/caduceus/tasks/tasks.py index caf89b5..961c4d6 100644 --- a/src/tasks/tasks.py +++ b/src/caduceus/tasks/tasks.py @@ -4,12 +4,12 @@ import torch.nn as nn from einops import rearrange -import src.models.nn.utils as U -import src.tasks.metrics as M -import torchmetrics as tm -from src.models.nn.adaptive_softmax import AdaptiveEmbedding, ProjectedAdaptiveLogSoftmax -from src.tasks.torchmetrics 
import torchmetric_fns as tm_mine -from src.utils.config import to_list, instantiate +import caduceus.models.nn.utils as U +import caduceus.tasks.metrics as M +from caduceus.tasks import torchmetrics as tm +from caduceus.models.nn.adaptive_softmax import AdaptiveEmbedding, ProjectedAdaptiveLogSoftmax +from caduceus.tasks.torchmetrics import torchmetric_fns as tm_mine +from caduceus.utils.config import to_list, instantiate from torchmetrics import MetricCollection diff --git a/src/tasks/torchmetrics.py b/src/caduceus/tasks/torchmetrics.py similarity index 100% rename from src/tasks/torchmetrics.py rename to src/caduceus/tasks/torchmetrics.py diff --git a/src/utils/__init__.py b/src/caduceus/utils/__init__.py similarity index 100% rename from src/utils/__init__.py rename to src/caduceus/utils/__init__.py diff --git a/src/utils/config.py b/src/caduceus/utils/config.py similarity index 100% rename from src/utils/config.py rename to src/caduceus/utils/config.py diff --git a/src/caduceus/utils/optim/__init__.py b/src/caduceus/utils/optim/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/optim/schedulers.py b/src/caduceus/utils/optim/schedulers.py similarity index 100% rename from src/utils/optim/schedulers.py rename to src/caduceus/utils/optim/schedulers.py diff --git a/src/utils/optim_groups.py b/src/caduceus/utils/optim_groups.py similarity index 100% rename from src/utils/optim_groups.py rename to src/caduceus/utils/optim_groups.py diff --git a/src/utils/registry.py b/src/caduceus/utils/registry.py similarity index 57% rename from src/utils/registry.py rename to src/caduceus/utils/registry.py index ba60fd0..49a116d 100644 --- a/src/utils/registry.py +++ b/src/caduceus/utils/registry.py @@ -7,7 +7,7 @@ "adamw": "torch.optim.AdamW", "rmsprop": "torch.optim.RMSprop", "sgd": "torch.optim.SGD", - "lamb": "src.utils.optim.lamb.JITLamb", + "lamb": "caduceus.utils.optim.lamb.JITLamb", } scheduler = { @@ -19,29 +19,29 @@ "constant_warmup": 
"transformers.get_constant_schedule_with_warmup", "linear_warmup": "transformers.get_linear_schedule_with_warmup", "cosine_warmup": "transformers.get_cosine_schedule_with_warmup", - "cosine_warmup_timm": "src.utils.optim.schedulers.TimmCosineLRScheduler", + "cosine_warmup_timm": "caduceus.utils.optim.schedulers.TimmCosineLRScheduler", } model = { # Pre-training LM head models - "hyena_lm": "src.models.sequence.long_conv_lm.ConvLMHeadModel", + "hyena_lm": "caduceus.models.sequence.long_conv_lm.ConvLMHeadModel", "mamba_lm": "mamba_ssm.models.mixer_seq_simple.MambaLMHeadModel", - "caduceus_lm": "caduceus.modeling_caduceus.CaduceusForMaskedLM", + "caduceus_lm": "caduceus.huggingface.modeling_caduceus.CaduceusForMaskedLM", # Downstream task embedding backbones - "dna_embedding": "src.models.sequence.dna_embedding.DNAEmbeddingModel", - "dna_embedding_mamba": "src.models.sequence.dna_embedding.DNAEmbeddingModelMamba", - "dna_embedding_caduceus": "src.models.sequence.dna_embedding.DNAEmbeddingModelCaduceus", + "dna_embedding": "caduceus.models.sequence.dna_embedding.DNAEmbeddingModel", + "dna_embedding_mamba": "caduceus.models.sequence.dna_embedding.DNAEmbeddingModelMamba", + "dna_embedding_caduceus": "caduceus.models.sequence.dna_embedding.DNAEmbeddingModelCaduceus", # Baseline for genomics benchmark - "genomics_benchmark_cnn": "src.models.baseline.genomics_benchmark_cnn.GenomicsBenchmarkCNN", + "genomics_benchmark_cnn": "caduceus.models.baseline.genomics_benchmark_cnn.GenomicsBenchmarkCNN", } layer = { - "id": "src.models.sequence.base.SequenceIdentity", - "ff": "src.models.sequence.ff.FF", - "hyena": "src.models.sequence.hyena.HyenaOperator", - "hyena-filter": "src.models.sequence.hyena.HyenaFilter", + "id": "caduceus.models.sequence.base.SequenceIdentity", + "ff": "caduceus.models.sequence.ff.FF", + "hyena": "caduceus.models.sequence.hyena.HyenaOperator", + "hyena-filter": "caduceus.models.sequence.hyena.HyenaFilter", } callbacks = { @@ -53,11 +53,11 @@ "swa": 
"pytorch_lightning.callbacks.StochasticWeightAveraging", "rich_model_summary": "pytorch_lightning.callbacks.RichModelSummary", "rich_progress_bar": "pytorch_lightning.callbacks.RichProgressBar", - "params": "src.callbacks.params.ParamsLog", - "timer": "src.callbacks.timer.Timer", - "val_every_n_global_steps": "src.callbacks.validation.ValEveryNGlobalSteps", + "params": "caduceus.callbacks.params.ParamsLog", + "timer": "caduceus.callbacks.timer.Timer", + "val_every_n_global_steps": "caduceus.callbacks.validation.ValEveryNGlobalSteps", } model_state_hook = { - 'load_backbone': 'src.models.sequence.dna_embedding.load_backbone', + 'load_backbone': 'caduceus.models.sequence.dna_embedding.load_backbone', } diff --git a/src/utils/train.py b/src/caduceus/utils/train.py similarity index 99% rename from src/utils/train.py rename to src/caduceus/utils/train.py index 3b7e48c..d8b8d45 100644 --- a/src/utils/train.py +++ b/src/caduceus/utils/train.py @@ -13,7 +13,7 @@ from omegaconf import DictConfig, OmegaConf from pytorch_lightning.utilities import rank_zero_only -from src.utils.config import omegaconf_filter_keys +from caduceus.utils.config import omegaconf_filter_keys # Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging diff --git a/train.py b/train.py index c49b878..fe0382c 100644 --- a/train.py +++ b/train.py @@ -18,15 +18,16 @@ from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn -import src.models.nn.utils as U -import src.utils as utils -import src.utils.train -from src.dataloaders import SequenceDataset # TODO make registry -from src.tasks import decoders, encoders, tasks -from src.utils import registry -from src.utils.optim_groups import add_optimizer_hooks +# from caduceus import caduceus as U, caduceus +import caduceus +from caduceus import utils +from caduceus.models.nn.utils import PassthroughSequential +from caduceus.dataloaders import 
SequenceDataset # TODO make registry +from caduceus.tasks import encoders, decoders, tasks +from caduceus.utils import registry +from caduceus.utils.optim_groups import add_optimizer_hooks -log = src.utils.train.get_logger(__name__) +log = caduceus.utils.train.get_logger(__name__) # Turn on TensorFloat32 (speeds up large model training substantially) import torch.backends @@ -226,8 +227,8 @@ def setup(self, stage=None): ) # Extract the modules, so they show up in the top level parameter count - self.encoder = U.PassthroughSequential(self.task.encoder, encoder) - self.decoder = U.PassthroughSequential(decoder, self.task.decoder) + self.encoder = PassthroughSequential(self.task.encoder, encoder) + self.decoder = PassthroughSequential(decoder, self.task.decoder) self.loss = self.task.loss self.loss_val = self.task.loss if hasattr(self.task, 'loss_val'): @@ -523,7 +524,7 @@ def configure_optimizers(self): # Print optimizer info for debugging keys = set([k for hp in hps for k in hp.keys()]) # Special hparams - utils.train.log_optimizer(log, optimizer, keys) + caduceus.utils.train.log_optimizer(log, optimizer, keys) # Configure scheduler if "scheduler" not in self.hparams: return optimizer @@ -703,14 +704,14 @@ def main(config: OmegaConf): # - register evaluation resolver # - filter out keys used only for interpolation # - optional hooks, including disabling python warnings or debug friendly configuration - config = utils.train.process_config(config) + config = caduceus.utils.train.process_config(config) # if config.train.get("compile_model", False): # # See: https://github.com/arogozhnikov/einops/wiki/Using-torch.compile-with-einops # from einops._torch_specific import allow_ops_in_compiled_graph # requires einops>=0.6.1 # allow_ops_in_compiled_graph() # Pretty print config using Rich library - utils.train.print_config(config, resolve=True) + caduceus.utils.train.print_config(config, resolve=True) train(config)