Skip to content

Commit a545e86

Browse files
garymm authored and mahdikhashan committed
Support old-style TensorFlow events (tensorboard) (kubeflow#2467)
* Support old-style TensorFlow events (tensorboard)

  Fixes: kubeflow#2466

  Signed-off-by: Gary Miguel <[email protected]>

* format

  Signed-off-by: Gary Miguel <[email protected]>

* test

  Signed-off-by: Gary Miguel <[email protected]>

* don't continue loops

  Signed-off-by: Gary Miguel <[email protected]>

* format

  Signed-off-by: Gary Miguel <[email protected]>

---------

Signed-off-by: Gary Miguel <[email protected]>
1 parent 2daece4 commit a545e86

File tree

3 files changed

+84
-34
lines changed

3 files changed

+84
-34
lines changed

pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py

+41-24
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,23 @@
3030
import rfc3339
3131
import tensorflow as tf
3232
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
33-
from tensorboard.backend.event_processing.tag_types import TENSORS
33+
from tensorboard.backend.event_processing.tag_types import SCALARS, TENSORS
3434

3535
from pkg.metricscollector.v1beta1.common import const
3636

3737

38+
def _should_consider(tag: str, metric_name: str, tfefile: str) -> bool:
39+
tfefile_parent_dir = (
40+
os.path.dirname(metric_name)
41+
if len(metric_name.split("/")) >= 2
42+
else os.path.dirname(tfefile)
43+
)
44+
basedir_name = os.path.dirname(tfefile)
45+
return tag.startswith(metric_name.split("/")[-1]) and basedir_name.endswith(
46+
tfefile_parent_dir
47+
)
48+
49+
3850
class TFEventFileParser:
3951
def __init__(self, metric_names):
4052
self.metric_names = metric_names
@@ -47,31 +59,36 @@ def find_all_files(directory):
4759

4860
def parse_summary(self, tfefile):
4961
metric_logs = []
50-
event_accumulator = EventAccumulator(tfefile, size_guidance={TENSORS: 0})
62+
event_accumulator = EventAccumulator(
63+
tfefile, size_guidance={SCALARS: 0, TENSORS: 0}
64+
)
5165
event_accumulator.Reload()
52-
for tag in event_accumulator.Tags()[TENSORS]:
66+
tags = event_accumulator.Tags()
67+
for tag in tags[TENSORS]:
5368
for m in self.metric_names:
54-
tfefile_parent_dir = (
55-
os.path.dirname(m)
56-
if len(m.split("/")) >= 2
57-
else os.path.dirname(tfefile)
58-
)
59-
basedir_name = os.path.dirname(tfefile)
60-
if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith(
61-
tfefile_parent_dir
62-
):
63-
continue
64-
65-
for tensor in event_accumulator.Tensors(tag):
66-
ml = api_pb2.MetricLog(
67-
time_stamp=rfc3339.rfc3339(
68-
datetime.fromtimestamp(tensor.wall_time)
69-
),
70-
metric=api_pb2.Metric(
71-
name=m, value=str(tf.make_ndarray(tensor.tensor_proto))
72-
),
73-
)
74-
metric_logs.append(ml)
69+
if _should_consider(tag, m, tfefile):
70+
for tensor in event_accumulator.Tensors(tag):
71+
ml = api_pb2.MetricLog(
72+
time_stamp=rfc3339.rfc3339(
73+
datetime.fromtimestamp(tensor.wall_time)
74+
),
75+
metric=api_pb2.Metric(
76+
name=m, value=str(tf.make_ndarray(tensor.tensor_proto))
77+
),
78+
)
79+
metric_logs.append(ml)
80+
# support old-style tensorboard metrics too
81+
for tag in tags[SCALARS]:
82+
for m in self.metric_names:
83+
if _should_consider(tag, m, tfefile):
84+
for scalar in event_accumulator.Scalars(tag):
85+
ml = api_pb2.MetricLog(
86+
time_stamp=rfc3339.rfc3339(
87+
datetime.fromtimestamp(scalar.wall_time)
88+
),
89+
metric=api_pb2.Metric(name=m, value=str(scalar.value)),
90+
)
91+
metric_logs.append(ml)
7592

7693
return metric_logs
7794

test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py

+42-10
Original file line numberDiff line numberDiff line change
@@ -13,35 +13,67 @@
1313
# limitations under the License.
1414

1515
import os
16+
import tempfile
1617
import unittest
1718

19+
import tensorboardX
1820
import utils
1921

22+
METRIC_DIR_NAMES = ("train", "test")
23+
METRIC_NAMES = ("accuracy", "loss")
24+
QUALIFIED_METRIC_NAMES = tuple(
25+
f"{dir}/{metric}"
26+
for dir in METRIC_DIR_NAMES
27+
for metric in METRIC_NAMES
28+
)
2029

2130
class TestTFEventMetricsCollector(unittest.TestCase):
2231
def test_parse_file(self):
2332

2433
current_dir = os.path.dirname(os.path.abspath(__file__))
2534
logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs")
2635

27-
# Metric format is "{{dirname}}/{{metrics name}}"
28-
metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"]
29-
metric_logs = utils.get_metric_logs(logs_dir, metric_names)
36+
37+
metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES)
3038
self.assertEqual(20, len(metric_logs))
3139

3240
for log in metric_logs:
3341
actual = log["metric"]["name"]
34-
self.assertIn(actual, metric_names)
42+
self.assertIn(actual, QUALIFIED_METRIC_NAMES)
43+
44+
train_metric_logs = utils.get_metric_logs(
45+
os.path.join(logs_dir, "train"), METRIC_NAMES)
46+
self.assertEqual(10, len(train_metric_logs))
47+
48+
for log in train_metric_logs:
49+
actual = log["metric"]["name"]
50+
self.assertIn(actual, METRIC_NAMES)
51+
52+
def test_parse_file_with_tensorboardX(self):
53+
logs_dir = tempfile.mkdtemp()
54+
num_iters = 3
3555

36-
# Metric format is "{{metrics name}}"
37-
metric_names = ["accuracy", "loss"]
38-
metrics_file_dir = os.path.join(logs_dir, "train")
39-
metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names)
40-
self.assertEqual(10, len(metric_logs))
56+
for dir_name in METRIC_DIR_NAMES:
57+
with tensorboardX.SummaryWriter(os.path.join(logs_dir, dir_name)) as writer:
58+
for metric_name in METRIC_NAMES:
59+
for iter in range(num_iters):
60+
writer.add_scalar(metric_name, 0.1, iter)
61+
62+
63+
metric_logs = utils.get_metric_logs(logs_dir, QUALIFIED_METRIC_NAMES)
64+
self.assertEqual(num_iters * len(QUALIFIED_METRIC_NAMES), len(metric_logs))
4165

4266
for log in metric_logs:
4367
actual = log["metric"]["name"]
44-
self.assertIn(actual, metric_names)
68+
self.assertIn(actual, QUALIFIED_METRIC_NAMES)
69+
70+
train_metric_logs = utils.get_metric_logs(
71+
os.path.join(logs_dir, "train"), METRIC_NAMES)
72+
self.assertEqual(num_iters * len(METRIC_NAMES), len(train_metric_logs))
73+
74+
for log in train_metric_logs:
75+
actual = log["metric"]["name"]
76+
self.assertIn(actual, METRIC_NAMES)
4577

4678

4779
if __name__ == '__main__':

test/unit/v1beta1/requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
grpcio-testing==1.64.1
22
pytest==7.2.0
3+
tensorboardX==2.6.2.2
34
kubeflow-training[huggingface]==1.9.0

0 commit comments

Comments
 (0)