Commit 75356e4

Merge pull request #214 from macrocosm-os/dev
Release 4.6.3
2 parents: bc90394 + b29ea26

File tree: constants/__init__.py, neurons/validator.py, pretrain/dataset.py, requirements.txt

4 files changed: +67 −41 lines
constants/__init__.py

+23 −4

@@ -34,7 +34,7 @@
 # ---------------------------------

 # Release
-__version__ = "4.6.2"
+__version__ = "4.6.3"

 # Validator schema version
 __validator_version__ = "4.6.0"
@@ -173,7 +173,27 @@
         epsilon_func=LinearDecay(0.005, 0.0002, 36000),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
-
+    # This constraint is not actually used; it is added as a copy
+    # of the 14B-model competition constraint entry.
+    # This is just to keep the size of the constraint dict equal
+    # to the number of competitions so `updated_models_limit` is
+    # set correctly below.
+    # This hack will be removed once native support for multiple
+    # datasets is implemented in a future release.
+    CompetitionId.B14_MODEL_MULTI_DATASET: ModelConstraints(
+        max_model_parameter_size=13_900_000_000,
+        min_model_parameter_size=13_700_000_000,
+        sequence_length=4096,
+        allowed_architectures=ALLOWED_MODEL_TYPES_2,
+        tokenizer="Xenova/gpt-4",
+        kwargs={
+            "torch_dtype": torch.bfloat16,
+            "attn_implementation": "flash_attention_2",
+        },
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0002, 36000),
+        max_bytes=29 * 1024 * 1024 * 1024,
+    ),
 }

 # Schedule of competitions by block.
@@ -217,7 +237,6 @@
             0.4,
         ),
     ],
-
 ),
 ]

@@ -258,7 +277,7 @@
 sample_min = 5
 # Max number of uids that can be either pending eval or currently being evaluated.
 # We allow the sample_min per competition + 10 additional models to be held at any one time.
-updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
+updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_2) + 10
 # time required between updates to the chain.
 chain_update_cadence = dt.timedelta(minutes=20)
 # Number of blocks required between retrying evaluation of a model.
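The duplicated entry above exists only so that `len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_2)` stays equal to the number of competitions, because `updated_models_limit` is derived from the dict's size. A minimal sketch of that arithmetic, with placeholder strings standing in for the real `CompetitionId` keys and `ModelConstraints` values:

# Sketch only: placeholder entries stand in for the real
# CompetitionId -> ModelConstraints mapping in constants/__init__.py.
MODEL_CONSTRAINTS_BY_COMPETITION_ID_2 = {
    "B7_MODEL": "constraints...",
    "B14_MODEL": "constraints...",
    "B14_MODEL_MULTI_DATASET": "constraints...",  # the copy added in this commit
}

sample_min = 5  # minimum models kept per competition

# With 3 competitions: 5 * 3 + 10 == 25 models pending or in eval at once.
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_2) + 10
print(updated_models_limit)  # -> 25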

neurons/validator.py

+23 −15
@@ -27,6 +27,7 @@
 import datetime as dt
 import functools
 import json
+import logging
 import math
 import os
 import pickle
@@ -35,8 +36,11 @@
 import traceback
 import typing
 from collections import defaultdict
+from retry import retry

 import bittensor as bt
+from bittensor.utils.btlogging.helpers import all_loggers
+from bittensor.utils.btlogging.defines import BITTENSOR_LOGGER_NAME
 import torch
 import wandb

@@ -126,8 +130,16 @@ def state_path(self) -> str:

     def __init__(self):
         self.config = config.validator_config()
+        # Manually default to info before overriding with arguments.
+        # If this is not done, info logging does not work when other modes are not specified.
+        bt.logging.set_info()
         bt.logging(config=self.config)

+        # Setting the logging level on bittensor messes with all loggers, which we don't want, so explicitly set the others to warning here.
+        for logger in all_loggers():
+            if not logger.name.startswith(BITTENSOR_LOGGER_NAME):
+                logger.setLevel(logging.WARNING)
+
         bt.logging.info(f"Starting validator with config: {self.config}")

         # === Bittensor objects ====
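For context, `all_loggers()` appears to iterate every logger registered with Python's logging machinery, bittensor's included, and the loop above raises only the non-bittensor ones to warning. A stdlib-only sketch of the same quieting pattern (an illustration, not the validator's code):

import logging

# Raise every non-bittensor logger to WARNING so a global level change
# made by bt.logging does not leave third-party libraries chatty.
for name in list(logging.root.manager.loggerDict):
    if not name.startswith("bittensor"):
        logging.getLogger(name).setLevel(logging.WARNING)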
@@ -172,7 +184,7 @@ def __init__(self):
         self._new_wandb_run()

         # === Running args ===
-        self.weights = torch.zeros_like(torch.tensor(self.metagraph.S))
+        self.weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))
         self.global_step = 0
         self.last_epoch = self.metagraph.block.item()

@@ -380,6 +392,9 @@ def update_models(self):
         # Track how recently we checked the list of top models.
         last_checked_top_models_time = None

+        # Delay the first update loop until the metagraph has been synced.
+        time.sleep(60)
+
         # The below loop iterates across all miner uids and checks to see
         # if they should be updated.
         while not self.stop_event.is_set():
@@ -713,7 +728,7 @@ async def _try_set_weights():
                 netuid=self.config.netuid,
                 wallet=self.wallet,
                 uids=uids,
-                weights=self.weights,
+                weights=self.weights.numpy(),
                 wait_for_inclusion=False,
                 version_key=constants.weights_version_key,
             )
@@ -722,15 +737,6 @@ async def _try_set_weights():
             except:
                 bt.logging.warning("Failed to set weights. Trying again later.")

-            ws, ui = self.weights.topk(len(self.weights))
-            table = Table(title="All Weights")
-            table.add_column("uid", justify="right", style="cyan", no_wrap=True)
-            table.add_column("weight", style="magenta")
-            for index, weight in list(zip(ui.tolist(), ws.tolist())):
-                table.add_row(str(index), str(round(weight, 4)))
-            console = Console()
-            console.print(table)
-
         try:
             bt.logging.debug(f"Setting weights.")
             await asyncio.wait_for(_try_set_weights(), ttl)
@@ -740,8 +746,12 @@

     def _get_current_block(self) -> int:
         """Returns the current block."""
-        try:
+        @retry(tries=5, delay=1, backoff=2)
+        def _get_block_with_retry():
             return self.subtensor.block
+
+        try:
+            return _get_block_with_retry()
         except:
             bt.logging.debug(
                 "Failed to get the latest block from the chain. Using the block from the cached metagraph."
@@ -854,8 +864,6 @@ async def run_step(self):

         bt.logging.trace(f"Current block: {cur_block}")

-
-
         if cur_block < constants.BLOCK_STACK_V2_DEDUP:
             dataset_by_competition_id = constants.DATASET_BY_COMPETITION_ID
         else:
@@ -1232,7 +1240,7 @@ def _compute_and_set_competition_weights(

         # Fill in metagraph sized tensor with the step weights of the evaluated models.
         with self.metagraph_lock:
-            competition_weights = torch.zeros_like(self.metagraph.S)
+            competition_weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))

             for i, uid_i in enumerate(uids):
                 competition_weights[uid_i] = step_weights[i]

pretrain/dataset.py

+18 −18

Most of these hunks strip trailing whitespace; the substantive change switches the Software Heritage blob download from an s3:// URI to the bucket's public HTTPS endpoint.
@@ -12,17 +12,17 @@
 from pprint import pprint

 import os
-from dotenv import load_dotenv
+from dotenv import load_dotenv
 load_dotenv()

 class SubsetLoader(IterableDataset):
     """Base class for data-specific subset loader classes."""
-
+
     name: str = None  # Dataset name
     rows_base_url: str = "https://datasets-server.huggingface.co/rows"
     size_base_url: str = "https://datasets-server.huggingface.co/size"
     max_pages: int = None
-
+
     def __init__(
         self,
         batch_size=None,
@@ -78,9 +78,9 @@ def __init__(
             self._initialize_pages()
             fetch_attempt += 1

-            # Exit if the buffer has at least one batch
+            # Exit if the buffer has at least one batch
             if len(self.buffer) >= self.sequence_length:
-                break
+                break

         bt.logging.warning(
             f"All fetched pages seem to be empty or have an extremely low token count. "
@@ -139,14 +139,14 @@ def _fetch_data_for_page(self, page):
             })
         else:
             self.params["offset"] = page
-
+
         self.params["length"] = self.num_rows_per_page
-
+
         attempt = 0
         while attempt < self.retry_limit:
             try:
                 response = requests.get(
-                    self.rows_base_url,
+                    self.rows_base_url,
                     params=self.params,
                     headers=self._get_request_headers()
                 )
@@ -183,9 +183,9 @@ def get_page_names(self):
         """Get page names in consistent format"""
         if not hasattr(self, 'pages'):
             return []
-
+
         if isinstance(self.pages[0], tuple):
-            return [f"{cfg_name}_{num_rows}_{split}"
+            return [f"{cfg_name}_{num_rows}_{split}"
                     for cfg_name, num_rows, split in self.pages]
         return self.pages
@@ -257,15 +257,15 @@ def __init__(self, **kwargs):
             aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])

         self.s3_sess = session.client("s3")
-
+
         super().__init__(requires_auth=True, **kwargs)
-
+

     def _download_row_content(self, blob_id, src_encoding):
         """Download the row content from S3.
         """
-
-        s3_url = f"s3://softwareheritage/content/{blob_id}"
+
+        s3_url = f"https://softwareheritage.s3.amazonaws.com/content/{blob_id}"

         with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": self.s3_sess}) as fin:
             content = fin.read().decode(src_encoding)
@@ -277,7 +277,7 @@ def _get_content_from_row(self, row):

         content = self._download_row_content(row['row']['blob_id'], row['row']['src_encoding'])
         return content
-
+

 class SubsetFalconLoader(SubsetLoader):
     max_pages: int = 968000015
@@ -286,14 +286,14 @@ class SubsetFalconLoader(SubsetLoader):

 class SubsetFineWebEdu2Loader(SubsetLoader):
     name: str = "HuggingFaceFW/fineweb-edu-score-2"
-
+
     def fetch_dataset_configs(self) -> typing.Dict[str, typing.Dict]:
         """
         Fetch dataset configs and their metadata.
         Returns a dictionary with config names as keys and metadata as values.
         """
         params = dict(dataset=self.name)
-
+
         attempt = 0
         while attempt < self.retry_limit:
             try:
@@ -385,7 +385,7 @@ def get_random_pages(self, num_pages, initial_offset):
             split = self.configs_data[config_name]["split"]
             pages.append((config_name, selected_page_start, split))
         return pages
-
+
     def fetch_data_to_rows(self, num_pages):
         """Fetch data and return raw text rows instead of adding to buffer."""
         downloaded_pages = set()
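A hypothetical stand-alone equivalent of the new `_download_row_content`, using `requests` and `gzip` instead of `smart_open` (the function name and timeout are illustrative, not part of this commit):

import gzip
import requests

def download_blob(blob_id: str, src_encoding: str) -> str:
    # Software Heritage content blobs are stored gzip-compressed,
    # keyed by blob_id, in a publicly readable S3 bucket.
    url = f"https://softwareheritage.s3.amazonaws.com/content/{blob_id}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return gzip.decompress(resp.content).decode(src_encoding)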

requirements.txt

+3 −4

@@ -1,15 +1,14 @@
 torch==2.4.1
-bittensor==6.9.4
+bittensor==8.4.3
 huggingface-hub==0.25.2
 matplotlib==3.9.2
-pydantic==1.10
 python-dotenv==1.0.1
 rich==13.9.2
 safetensors==0.4.5
-numpy==2.1.2
+numpy==2.0.1
 transformers==4.44.1
 wandb==0.18.3
 datasets==3.0.1
 flash-attn==2.6.3
 smart-open[s3]==7.0.5
-taoverse==1.0.9
+taoverse==1.3.1
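The bittensor bump from 6.9.4 to 8.4.3 appears to drive the numpy-oriented changes in neurons/validator.py above. A quick way to confirm the new pins after reinstalling (assuming these distribution names, taken from this diff):

from importlib.metadata import version

# Pins taken from this commit; fails loudly if an old environment is active.
for pkg, pinned in [("bittensor", "8.4.3"), ("numpy", "2.0.1"), ("taoverse", "1.3.1")]:
    assert version(pkg) == pinned, f"{pkg} is {version(pkg)}, expected {pinned}"
print("requirements match the 4.6.3 pins")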
