layout | background-class | body-class | category | title | summary | image | author | tags | github-link | github-id | featured_image_1 | featured_image_2 | accelerator | demo-model-link | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
hub_detail |
hub-background |
hub |
researchers |
3D ResNet |
Resnet Style Video classification networks pretrained on the Kinetics 400 dataset |
slowfast.png |
FAIR PyTorchVideo |
|
facebookresearch/pytorchvideo |
no-image |
no-image |
“cuda-optional” |
모델 불러오기:
import torch
# Select the `slow_r50` model and load it with pretrained weights from the
# PyTorchVideo hub repository.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    "slow_r50",
    pretrained=True,
)
나머지 함수들 불러오기:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
CenterCropVideo,
NormalizeVideo,
)
from pytorchvideo.transforms import (
ApplyTransformToKey,
ShortSideScale,
UniformTemporalSubsample
)
모델을 평가 모드로 설정하고 원하는 디바이스 방식을 선택합니다.
# Choose whether to run on the GPU or the CPU; this example uses the CPU.
device = "cpu"
# Put the model in evaluation mode, then move it to the chosen device.
model = model.eval().to(device)
토치 허브 모델이 훈련된 Kinetics 400 데이터셋에 대해 ID에서의 레이블과 맞는 정보를 다운로드합니다. 이는 예측된 클래스 ID에서 카테고리 레이블 이름을 가져오는데 사용됩니다.
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly rather than relying on another package having already done so.
import urllib.request

# Remote location of the Kinetics class-name file and the local path to save it to.
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

# Download with the Python 3 API directly. The earlier `urllib.URLopener()`
# attempt only exists on Python 2, and its bare `except:` swallowed every
# exception (including KeyboardInterrupt) before falling back to this call.
urllib.request.urlretrieve(json_url, json_filename)
# Read the downloaded class-name table.
with open(json_filename, "r") as class_file:
    kinetics_classnames = json.load(class_file)

# Invert the table into an id -> label-name lookup, removing any double
# quotes from the label names.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}
# Settings consumed by the video transform and by the clip-duration formula.
side_size = 256  # size passed to ShortSideScale
crop_size = 256  # height and width passed to CenterCropVideo
mean = [0.45] * 3  # mean passed to NormalizeVideo
std = [0.225] * 3  # std passed to NormalizeVideo
num_frames = 8  # frame count passed to UniformTemporalSubsample
sampling_rate = 8
frames_per_second = 30
# Preprocessing steps applied, in order, to the "video" entry of a clip.
# Note: this transform is specific to the slow_R50 model.
_video_pipeline = Compose(
    [
        UniformTemporalSubsample(num_frames),
        Lambda(lambda frames: frames / 255.0),  # divide by 255 before normalising
        NormalizeVideo(mean, std),
        ShortSideScale(size=side_size),
        CenterCropVideo(crop_size=(crop_size, crop_size)),
    ]
)
transform = ApplyTransformToKey(key="video", transform=_video_pipeline)
# The duration of the input clip depends on the model:
# frames * sampling rate / frames per second.
clip_duration = num_frames * sampling_rate / frames_per_second
예제 영상을 다운로드합니다.
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly rather than relying on another package having already done so.
import urllib.request

# Remote location of the example clip and the local path to save it to.
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'

# Download with the Python 3 API directly. The earlier `urllib.URLopener()`
# attempt only exists on Python 2, and its bare `except:` swallowed every
# exception (including KeyboardInterrupt) before falling back to this call.
urllib.request.urlretrieve(url_link, video_path)
영상을 불러오고 이것을 모델에 필요한 입력 형식으로 변환합니다.
# Pick the clip to load by giving its start and end time.
# start_sec should match the point in the video where the action begins.
start_sec = 0
end_sec = start_sec + clip_duration

# Initialise the EncodedVideo helper class and load the video.
video = EncodedVideo.from_path(video_path)

# Load the desired clip, then apply the transform to normalise the video input.
video_data = transform(video.get_clip(start_sec=start_sec, end_sec=end_sec))

# Move the input to the desired device.
inputs = video_data["video"].to(device)
# Pass the input clip through the model. `inputs[None, ...]` adds a leading
# dimension (presumably the batch axis). Gradients are not needed for
# inference, so autograd is switched off to avoid building a graph.
with torch.no_grad():
    preds = model(inputs[None, ...])

    # Get the predicted classes: softmax over dim 1, then the five
    # highest-scoring indices of the first (only) item.
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices[0]

# Map the predicted class ids to their label names.
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))
모델 아키텍처는 Kinetics 데이터셋의 8x8 설정을 사용하여 사전 훈련된 가중치가 있는 참고문헌 [1]을 기반으로 합니다.
arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
---|---|---|---|---|---|---|
Slow | R50 | 8x8 | 74.58 | 91.63 | 54.52 | 32.45 |
[1] Christoph Feichtenhofer et al., "SlowFast Networks for Video Recognition" https://arxiv.org/pdf/1812.03982.pdf