diff --git a/extract.py b/extract.py
new file mode 100644
index 0000000..240262e
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Andreas Goulas | first creation
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (main(), print, etc.)
+
+import argparse
+import time
+import os
+import torch
+from torch.utils.data import DataLoader
+
+from fcvid import FCVID
+from ylimed import YLIMED
+from model import GraphModule
+
+parser = argparse.ArgumentParser(description='GCN Video Classification')
+parser.add_argument('model', nargs=1, help='trained model')
+parser.add_argument('--gcn_layers', type=int, default=2, help='number of gcn layers')
+parser.add_argument('--dataset', default='fcvid', choices=['fcvid', 'ylimed'])
+parser.add_argument('--dataset_root', default=r'D:\Users\gkalelis\PycharmProjects\FCVID', help='dataset root directory')
+parser.add_argument('--batch_size', type=int, default=64, help='batch size')
+parser.add_argument('--num_workers', type=int, default=0, help='number of workers for data loader')
+parser.add_argument('--save_folder', default='feats', help='directory to save features')
+parser.add_argument('-v', '--verbose', action='store_true', help='show details')
+args = parser.parse_args()
+
+
+def main():
+    if args.dataset == 'fcvid':
+        train_dataset = FCVID(args.dataset_root, is_train=True)
+        test_dataset = FCVID(args.dataset_root, is_train=False)
+    elif args.dataset == 'ylimed':
+        train_dataset = YLIMED(args.dataset_root, is_train=True)
+        test_dataset = YLIMED(args.dataset_root, is_train=False)
+
+    device = torch.device('cuda:0')
+    if not os.path.exists(args.save_folder):
+        os.mkdir(args.save_folder)
+
+    if args.verbose:
+        print("running on {}".format(device))
+        print("num train samples={}".format(len(train_dataset)))
+        print("num test samples={}".format(len(test_dataset)))
+        print("missing videos={}".format(train_dataset.num_missing + test_dataset.num_missing))
+
+    out_dim = 2 * train_dataset.NUM_FEATS
+    model = GraphModule(args.gcn_layers, train_dataset.NUM_FEATS).to(device)
+    data = torch.load(args.model[0])
+    model.load_state_dict(data)
+
+    dataset_list = [('train', train_dataset), ('test', test_dataset)]
+    for phase, dataset in dataset_list:
+        num_samples = len(dataset)
+        loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers)
+        out_feats = torch.zeros((num_samples, dataset.NUM_FRAMES, out_dim), dtype=torch.float32)
+
+        t0 = time.perf_counter()
+        gidx = 0
+        with torch.no_grad():
+            for i, batch in enumerate(loader):
+                feats, feat_global, _, _ = batch
+                feats = feats.to(device)
+
+                N, FR, B, NF = feats.shape
+                feats = feats.view(N * FR, B, NF)
+                out_data = model(feats, device).cpu()
+                out_data = out_data.view(N, FR, -1)
+                out_feat = torch.cat([out_data, feat_global], dim=-1)
+
+                out_feats[gidx:gidx+N, :, :] = out_feat
+                gidx += N
+
+        t1 = time.perf_counter()
+
+        truth = torch.from_numpy(dataset.labels)
+        torch.save(out_feats, os.path.join(args.save_folder, 'feats-' + phase + '.pt'))
+        torch.save(truth, os.path.join(args.save_folder, 'truth-' + phase + '.pt'))
+
+        if args.verbose:
+            print('phase {} dt={:.2f}sec'.format(phase, t1 - t0))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fcvid.py b/fcvid.py
new file mode 100644
index 0000000..8586111
--- /dev/null
+++ b/fcvid.py
@@ -0,0 +1,52 @@
+# Andreas Goulas
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (add object label/class information, text processing, etc.)
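+#
+# Expected data layout under root_dir (inferred from the paths used below):
+#   materials/FCVID_VideoName_TrainTestSplit.txt  video name + 'train'/'test' split
+#   materials/FCVID_Label.txt                     one row of multi-hot labels per video
+#   R152/<video>.npy                              per-frame object/box features
+#   R152_global/<video>.npy                       per-frame global features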
+
+import os
+import numpy as np
+from torch.utils.data import Dataset
+
+class FCVID(Dataset):
+    NUM_BOXES = 50
+    NUM_FEATS = 2048
+    NUM_CLASS = 239
+    NUM_FRAMES = 9
+
+    def __init__(self, root_dir, is_train):
+        self.root_dir = root_dir
+        self.phase = 'train' if is_train else 'test'
+
+        split_path = os.path.join(root_dir, 'materials', 'FCVID_VideoName_TrainTestSplit.txt')
+        data_split = np.genfromtxt(split_path, dtype='str')
+
+        label_path = os.path.join(root_dir, 'materials', 'FCVID_Label.txt')
+        labels = np.genfromtxt(label_path, dtype=np.float32)
+
+        # Keep only the videos of this split whose feature files exist on disk.
+        self.num_missing = 0
+        mask = np.zeros(data_split.shape[0], dtype=bool)
+        for i, row in enumerate(data_split):
+            if row[1] == self.phase:
+                base, _ = os.path.splitext(os.path.normpath(row[0]))
+                feats_path = os.path.join(root_dir, 'R152', base + '.npy')
+                if os.path.exists(feats_path):
+                    mask[i] = True
+                else:
+                    self.num_missing += 1
+
+        self.labels = labels[mask, :]
+        self.videos = data_split[mask, 0]
+
+    def __len__(self):
+        return len(self.videos)
+
+    def __getitem__(self, idx):
+        name = self.videos[idx]
+        name, _ = os.path.splitext(name)
+
+        feats_path = os.path.join(self.root_dir, 'R152', name + '.npy')
+        global_path = os.path.join(self.root_dir, 'R152_global', name + '.npy')
+        feats = np.load(feats_path)
+        feat_global = np.load(global_path)
+        label = self.labels[idx, :]
+
+        return (feats, feat_global, label, name)
+
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..0025284
--- /dev/null
+++ b/model.py
@@ -0,0 +1,77 @@
+# Andreas Goulas
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class GCNLayer(nn.Module):
+    def __init__(self, in_feats, out_feats):
+        super().__init__()
+        self.in_feats = in_feats
+        self.out_feats = out_feats
+        self.weight = nn.Parameter(torch.FloatTensor(in_feats, out_feats))
+        self.norm = nn.LayerNorm(out_feats)
+        nn.init.xavier_uniform_(self.weight.data)
+
+    def forward(self, x, adj):
+        x = x.matmul(self.weight)
+        x = adj.matmul(x)
+        x = self.norm(x)
+        x = F.relu(x)
+        return x
+
+class GraphModule(nn.Module):
+    def __init__(self, num_layers, num_feats):
+        super().__init__()
+        self.wq = nn.Linear(num_feats, num_feats)
+        self.wk = nn.Linear(num_feats, num_feats)
+
+        layers = []
+        for i in range(num_layers):
+            layers.append(GCNLayer(num_feats, num_feats))
+        self.gcn = nn.ModuleList(layers)
+
+    def forward(self, x, device, get_adj=False):
+        # Attention-style adjacency: squared query/key dot products,
+        # L1-normalized along each row.
+        qx = self.wq(x)
+        kx = self.wk(x)
+        dot_mat = qx.matmul(kx.transpose(-1, -2))
+        adj = F.normalize(dot_mat.square(), p=1, dim=-1)
+
+        for layer in self.gcn:
+            x = layer(x, adj)
+        x = x.mean(dim=-2)
+        if get_adj:
+            return x, adj
+        return x
+
+class Classifier(nn.Module):
+    def __init__(self, num_feats, num_hid, num_class):
+        super().__init__()
+        self.rnn = nn.LSTM(num_feats, num_feats, batch_first=True)
+        self.fc1 = nn.Linear(num_feats, num_hid)
+        self.fc2 = nn.Linear(num_hid, num_class)
+        self.drop = nn.Dropout()
+
+    def forward(self, x):
+        x, _ = self.rnn(x)
+        x = x[:, -1, :]
+        x = F.relu(self.fc1(x))
+        x = self.drop(x)
+        x = self.fc2(x)
+        return x
+
+class Model(nn.Module):
+    def __init__(self, gcn_layers, num_feats, num_class):
+        super().__init__()
+        self.graph = GraphModule(gcn_layers, num_feats)
+        self.cls = Classifier(2 * num_feats, num_feats, num_class)
+
+    def forward(self, feats, feat_global, device):
+        N, FR, B, NF = feats.shape
+        feats = feats.view(N * FR, B, NF)
+        x = self.graph(feats, device)
+        x = x.view(N, FR, -1)
+        x = torch.cat([x, feat_global], dim=-1)
+        x = self.cls(x)
+        return x
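+
+if __name__ == '__main__':
+    # Minimal shape-check sketch: feeds random tensors shaped (N, FR, B, NF),
+    # as in extract.py/train.py, through the full model. Dimensions here are
+    # made up and smaller than the real NUM_FEATS=2048 to keep the check light.
+    device = torch.device('cpu')
+    model = Model(gcn_layers=2, num_feats=64, num_class=10)
+    feats = torch.randn(2, 9, 5, 64)       # N=2 videos, FR=9 frames, B=5 boxes
+    feat_global = torch.randn(2, 9, 64)    # one global feature per frame
+    out = model(feats, feat_global, device)
+    print(out.shape)  # expected: torch.Size([2, 10])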
diff --git a/save_gcn.py b/save_gcn.py
new file mode 100644
index 0000000..7487fed
--- /dev/null
+++ b/save_gcn.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# Andreas Goulas
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (main(), etc.)
+
+import argparse
+import torch
+
+from fcvid import FCVID
+from ylimed import YLIMED
+from model import Model
+
+parser = argparse.ArgumentParser(description='GCN Video Classification')
+parser.add_argument('in_model', nargs=1, help='trained model')
+parser.add_argument('out_model', nargs=1, help='gcn model path')
+parser.add_argument('--gcn_layers', type=int, default=2, help='number of gcn layers')
+parser.add_argument('--dataset', default='fcvid', choices=['fcvid', 'ylimed'])
+args = parser.parse_args()
+
+def main():
+    if args.dataset == 'fcvid':
+        num_feats, num_class = FCVID.NUM_FEATS, FCVID.NUM_CLASS
+    elif args.dataset == 'ylimed':
+        num_feats, num_class = YLIMED.NUM_FEATS, YLIMED.NUM_CLASS
+
+    # Load the full checkpoint and keep only the GraphModule weights,
+    # which extract.py later reloads on its own.
+    model = Model(args.gcn_layers, num_feats, num_class)
+    data = torch.load(args.in_model[0])
+    model.load_state_dict(data['model_state_dict'])
+    torch.save(model.graph.state_dict(), args.out_model[0])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..33de054
--- /dev/null
+++ b/test.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# Andreas Goulas
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (text processing, etc.)
+
+import argparse
+import time
+import torch
+from torch.utils.data import DataLoader
+from sklearn.metrics import average_precision_score, accuracy_score
+
+from fcvid import FCVID
+from ylimed import YLIMED
+from model import Model
+
+parser = argparse.ArgumentParser(description='GCN Video Classification')
+parser.add_argument('model', nargs=1, help='trained model')
+parser.add_argument('--gcn_layers', type=int, default=2, help='number of gcn layers')
+parser.add_argument('--dataset', default='fcvid', choices=['fcvid', 'ylimed'])
+parser.add_argument('--dataset_root', default=r'D:\Users\gkalelis\PycharmProjects\FCVID', help='dataset root directory')
+parser.add_argument('--batch_size', type=int, default=64, help='batch size')
+parser.add_argument('--num_workers', type=int, default=4, help='number of workers for data loader')
+parser.add_argument('--save_scores', action='store_true', help='save the output scores')
+parser.add_argument('--save_path', default='scores.txt', help='output path')
+parser.add_argument('-v', '--verbose', action='store_true', help='show details')
+args = parser.parse_args()
+
+def test(model, dataset, loader, scores, out_file, device):
+    gidx = 0
+    model.eval()
+    with torch.no_grad():
+        for i, batch in enumerate(loader):
+            feats, feat_global, _, _ = batch
+            feats = feats.to(device)
+            feat_global = feat_global.to(device)
+            out_data = model(feats, feat_global, device)
+
+            N = out_data.shape[0]
+            if out_file:
+                for j in range(N):
+                    video_name = dataset.videos[gidx + j]
+                    out_file.write("{} ".format(video_name))
+                    out_file.write(' '.join([str(x.item()) for x in out_data[j, :]]))
+                    out_file.write('\n')
+
+            scores[gidx:gidx+N, :] = out_data.cpu()
+            gidx += N
+
+def main():
+    if args.dataset == 'fcvid':
+        dataset = FCVID(args.dataset_root, is_train=False)
+    elif args.dataset == 'ylimed':
+        dataset = YLIMED(args.dataset_root, is_train=False)
+
+    device = torch.device('cuda:0')
+    loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers)
+
+    if args.verbose:
+        print("running on {}".format(device))
+        print("num samples={}".format(len(dataset)))
+        print("missing videos={}".format(dataset.num_missing))
+
+    model = Model(args.gcn_layers, dataset.NUM_FEATS, dataset.NUM_CLASS).to(device)
+    data = torch.load(args.model[0])
+    model.load_state_dict(data['model_state_dict'])
+
+    out_file = None
+    if args.save_scores:
+        out_file = open(args.save_path, 'w')
+
+    num_test = len(dataset)
+    scores = torch.zeros((num_test, dataset.NUM_CLASS), dtype=torch.float32)
+
+    t0 = time.perf_counter()
+    test(model, dataset, loader, scores, out_file, device)
+    t1 = time.perf_counter()
+
+    scores = scores.numpy()
+    if args.save_scores:
+        out_file.close()
+
+    if args.dataset == 'fcvid':
+        ap = average_precision_score(dataset.labels, scores)
+        print('mAP={:.2f}% dt={:.2f}sec'.format(100 * ap, t1 - t0))
+    elif args.dataset == 'ylimed':
+        pred = scores.argmax(axis=1)
+        acc = accuracy_score(dataset.labels, pred)
+        print('accuracy={:.2f}% dt={:.2f}sec'.format(100 * acc, t1 - t0))
+
+if __name__ == '__main__':
+    main()
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..26cc40c
--- /dev/null
+++ b/train.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# Andreas Goulas | first creation
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (main(), print, etc.)
+
+import argparse
+import time
+import os
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+from fcvid import FCVID
+from ylimed import YLIMED
+from model import Model
+
+parser = argparse.ArgumentParser(description='GCN Video Classification')
+parser.add_argument('--gcn_layers', type=int, default=2, help='number of gcn layers')
+parser.add_argument('--dataset', default='fcvid', choices=['fcvid', 'ylimed'])
+parser.add_argument('--dataset_root', default=r'D:\Users\gkalelis\PycharmProjects\FCVID', help='dataset root directory')
+parser.add_argument('--lr', type=float, default=1e-4, help='initial learning rate')
+parser.add_argument('--step_size', type=int, default=50, help='period of learning rate decay')
+parser.add_argument('--num_epochs', type=int, default=60, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=64, help='batch size')
+parser.add_argument('--num_workers', type=int, default=4, help='number of workers for data loader')
+parser.add_argument('--resume', default=None, help='checkpoint to resume training')
+parser.add_argument('--save_interval', type=int, default=10, help='interval for saving models (epochs)')
+parser.add_argument('--save_folder', default='weights', help='directory to save checkpoints')
+parser.add_argument('-v', '--verbose', action='store_true', help='show details')
+args = parser.parse_args()
+
+def train(model, loader, crit, opt, sched, device):
+    epoch_loss = 0
+    for i, batch in enumerate(loader):
+        feats, feat_global, label, _ = batch
+        feats = feats.to(device)
+        feat_global = feat_global.to(device)
+        label = label.to(device)
+
+        opt.zero_grad()
+        out_data = model(feats, feat_global, device)
+        loss = crit(out_data, label)
+        loss.backward()
+        opt.step()
+
+        epoch_loss += loss.item()
+
+    sched.step()
+    return epoch_loss / len(loader)
+
+def main():
+    if not os.path.exists(args.save_folder):
+        os.mkdir(args.save_folder)
+
+    # FCVID is multi-label (BCE over logits); YLI-MED is single-label (cross-entropy).
+    if args.dataset == 'fcvid':
+        dataset = FCVID(args.dataset_root, is_train=True)
+        crit = nn.BCEWithLogitsLoss()
+    elif args.dataset == 'ylimed':
+        dataset = YLIMED(args.dataset_root, is_train=True)
+        crit = nn.CrossEntropyLoss()
+
+    device = torch.device('cuda:0')
+    loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)
+
+    if args.verbose:
+        print("running on {}".format(device))
+        print("num samples={}".format(len(dataset)))
+        print("missing videos={}".format(dataset.num_missing))
+
+    start_epoch = 0
+    model = Model(args.gcn_layers, dataset.NUM_FEATS, dataset.NUM_CLASS).to(device)
+    opt = optim.Adam(model.parameters(), lr=args.lr)
+    sched = optim.lr_scheduler.StepLR(opt, step_size=args.step_size, last_epoch=-1)
+    if args.resume:
+        data = torch.load(args.resume)
+        start_epoch = data['epoch']
+        model.load_state_dict(data['model_state_dict'])
+        opt.load_state_dict(data['opt_state_dict'])
+        sched.load_state_dict(data['sched_state_dict'])
+        if args.verbose:
+            print("resuming from epoch {}".format(start_epoch))
+
+    model.train()
+    for epoch in range(start_epoch, args.num_epochs):
+        t0 = time.perf_counter()
+        loss = train(model, loader, crit, opt, sched, device)
+        t1 = time.perf_counter()
+
+        if (epoch + 1) % args.save_interval == 0:
+            sfnametmpl = 'model-{}-{:03d}.pt'
+            sfname = sfnametmpl.format(args.dataset, epoch + 1)
+            spth = os.path.join(args.save_folder, sfname)
+            torch.save({
+                'epoch': epoch + 1,
+                'loss': loss,
+                'model_state_dict': model.state_dict(),
+                'opt_state_dict': opt.state_dict(),
+                'sched_state_dict': sched.state_dict()
+            }, spth)
+
+        if args.verbose:
+            print("[epoch {}] loss={} dt={:.2f}sec".format(epoch + 1, loss, t1 - t0))
+
+if __name__ == '__main__':
+    main()
diff --git a/train_lstm.py b/train_lstm.py
new file mode 100644
index 0000000..8a79a6b
--- /dev/null
+++ b/train_lstm.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Andreas Goulas
+
+import argparse
+import time
+import os
+import sys
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.metrics import average_precision_score, accuracy_score
+
+from fcvid import FCVID
+from ylimed import YLIMED
+from model import Classifier
+
+def train(model, loader, crit, opt, sched, device):
+    model.train()
+    epoch_loss = 0
+    for i, batch in enumerate(loader):
+        feats, label = batch
+        feats = feats.to(device)
+        label = label.to(device)
+
+        opt.zero_grad()
+        out_data = model(feats)
+        loss = crit(out_data, label)
+        loss.backward()
+        opt.step()
+
+        epoch_loss += loss.item()
+
+    # The scheduler is stepped once per epoch, here only.
+    sched.step()
+    return epoch_loss / len(loader)
+
+def test(model, loader, scores, device):
+    gidx = 0
+    model.eval()
+    with torch.no_grad():
+        for i, batch in enumerate(loader):
+            feats, _ = batch
+            feats = feats.to(device)
+            out_data = model(feats)
+
+            N = out_data.shape[0]
+            scores[gidx:gidx+N, :] = out_data.cpu()
+            gidx += N
+
+parser = argparse.ArgumentParser(description='GCN Video Classification')
+parser.add_argument('--dataset', default='fcvid', choices=['fcvid', 'ylimed'])
+parser.add_argument('--feats_folder', default='feats', help='directory to load features')
+parser.add_argument('--lr', type=float, default=1e-5, help='initial learning rate')
+parser.add_argument('--gamma', type=float, default=1, help='learning rate decay rate')
+parser.add_argument('--num_epochs', type=int, default=10, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=16, help='batch size')
+parser.add_argument('--num_workers', type=int, default=0, help='number of workers for data loader; must be 0')
+parser.add_argument('--eval_interval', type=int, default=1, help='interval for evaluating models (epochs)')
+parser.add_argument('-v', '--verbose', action='store_true', help='show details')
+args = parser.parse_args()
+
+def main():
+    train_feats = torch.load(os.path.join(args.feats_folder, 'feats-train.pt'))
+    train_truth = torch.load(os.path.join(args.feats_folder, 'truth-train.pt'))
+    test_feats = torch.load(os.path.join(args.feats_folder, 'feats-test.pt'))
+    test_truth = torch.load(os.path.join(args.feats_folder, 'truth-test.pt'))
+
+    if args.dataset == 'ylimed':
+        train_truth = train_truth.long()
+        test_truth = test_truth.long()
+
+    train_dataset = TensorDataset(train_feats, train_truth)
+    test_dataset = TensorDataset(test_feats, test_truth)
+
+    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True)
+    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
+
+    test_truth = test_truth.numpy()
+
+    device = torch.device('cuda:0')
+    if args.verbose:
+        print('running on %s' % device)
+        print('num train samples=%d' % len(train_dataset))
+        print('num test samples=%d' % len(test_dataset))
+
+    if args.dataset == 'fcvid':
+        crit = nn.BCEWithLogitsLoss()
+        num_feats, num_class = FCVID.NUM_FEATS, FCVID.NUM_CLASS
+    elif args.dataset == 'ylimed':
+        crit = nn.CrossEntropyLoss()
+        num_feats, num_class = YLIMED.NUM_FEATS, YLIMED.NUM_CLASS
+    else:
+        sys.exit("Unknown dataset!")
+
+    start_epoch = 0
+    model = Classifier(2 * num_feats, num_feats, num_class).to(device)
+    opt = optim.Adam(model.parameters(), lr=args.lr)
+    sched = optim.lr_scheduler.ExponentialLR(opt, args.gamma)
+
+    for epoch in range(start_epoch, args.num_epochs):
+        t0 = time.perf_counter()
+        loss = train(model, train_loader, crit, opt, sched, device)
+        t1 = time.perf_counter()
+
+        if args.verbose:
+            print('[epoch %d] loss=%f dt=%.2fsec' % (epoch + 1, loss, t1 - t0))
+
+        if (epoch + 1) % args.eval_interval == 0:
+            num_test = len(test_dataset)
+            scores = torch.zeros((num_test, num_class), dtype=torch.float32)
+            test(model, test_loader, scores, device)
+            scores = scores.numpy()
+
+            if args.dataset == 'fcvid':
+                ap = average_precision_score(test_truth, scores)
+                print('mAP=%.2f%%' % (100 * ap))
+            elif args.dataset == 'ylimed':
+                pred = scores.argmax(axis=1)
+                acc = accuracy_score(test_truth, pred)
+                print('accuracy=%.2f%%' % (100 * acc))
+                # pred only exists for ylimed, so save it inside this branch.
+                torch.save(pred, os.path.join(args.feats_folder, 'pred-test.pt'))
+
+if __name__ == '__main__':
+    main()
diff --git a/ylimed.py b/ylimed.py
new file mode 100644
index 0000000..700ad66
--- /dev/null
+++ b/ylimed.py
@@ -0,0 +1,53 @@
+# Andreas Goulas
+# Nikolaos Gkalelis | 23/4/2021 | minor changes (add object label/class information, text processing, etc.)
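+#
+# Expected data layout under root_dir (inferred from the paths used below):
+#   YLI-MED_Corpus_v.1.4.txt     corpus index; column 0 holds the video id,
+#                                column 7 the event label (EvXXX, Ev100 skipped),
+#                                column 13 the 'Training'/'Test' split
+#   R152/<video id>.npy          per-frame object/box features
+#   R152_global/<video id>.npy   per-frame global features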
+
+import os
+import numpy as np
+from torch.utils.data import Dataset
+
+class YLIMED(Dataset):
+    NUM_BOXES = 50
+    NUM_FEATS = 2048
+    NUM_CLASS = 10
+    NUM_FRAMES = 9
+
+    def __init__(self, root_dir, is_train):
+        self.root_dir = root_dir
+        self.phase = 'Training' if is_train else 'Test'
+
+        split_path = os.path.join(root_dir, 'YLI-MED_Corpus_v.1.4.txt')
+        data_split = np.genfromtxt(split_path, dtype='str', skip_header=1)
+
+        self.num_missing = 0
+        mask = np.zeros(data_split.shape[0], dtype=bool)
+        for i, row in enumerate(data_split):
+            if row[7] == 'Ev100':
+                continue
+
+            if row[13] == self.phase:
+                feats_path = os.path.join(root_dir, 'R152', row[0] + '.npy')
+                if os.path.exists(feats_path):
+                    mask[i] = True
+                else:
+                    self.num_missing += 1
+
+        self.videos = data_split[mask, 0]
+        labels = [int(x[3:]) - 1 for x in data_split[mask, 7]]
+        self.labels = np.array(labels, dtype=np.int32)
+
+    def __len__(self):
+        return len(self.videos)
+
+    def __getitem__(self, idx):
+        name = self.videos[idx]
+        name, _ = os.path.splitext(name)
+
+        feats_path = os.path.join(self.root_dir, 'R152', name + '.npy')
+        global_path = os.path.join(self.root_dir, 'R152_global', name + '.npy')
+
+        feats = np.load(feats_path)
+        feat_global = np.load(global_path)
+        label = np.int64(self.labels[idx])
+
+        return (feats, feat_global, label, name)
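+
+# A possible end-to-end run with these scripts (a sketch; the checkpoint name
+# follows the defaults in train.py and <root> is a placeholder):
+#   python train.py --dataset fcvid --dataset_root <root>
+#   python save_gcn.py weights/model-fcvid-060.pt gcn-fcvid.pt
+#   python extract.py gcn-fcvid.pt --dataset fcvid --dataset_root <root>
+#   python train_lstm.py --dataset fcvid --feats_folder feats
+#   python test.py weights/model-fcvid-060.pt --dataset fcvid --dataset_root <root>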