From bf9b484056c59f9adc2ff219b08fdd93aa54494c Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Fri, 25 Apr 2025 10:12:49 +0100 Subject: [PATCH 01/12] Exploring cotracker online model options --- MANIFEST.in | 3 + notebook_cotracker_online.py | 307 +++++++++++++++++++++++++++++++++++ pyproject.toml | 4 + 3 files changed, 314 insertions(+) create mode 100644 notebook_cotracker_online.py diff --git a/MANIFEST.in b/MANIFEST.in index d5fb477d..d45a5b91 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -13,3 +13,6 @@ recursive-include docs *.md *.rst *.py # Include json schemas recursive-include ethology/annotations/json_schemas/schemas *.json recursive-include ethology/annotations/json_schemas/schemas *.md + +# Temporarily include notebooks +include notebook_cotracker_online.py diff --git a/notebook_cotracker_online.py b/notebook_cotracker_online.py new file mode 100644 index 00000000..f9378f84 --- /dev/null +++ b/notebook_cotracker_online.py @@ -0,0 +1,307 @@ +"""Multi-window approach for online tracking with CoTracker3. + +Todo: +- more query points? +- longer window better? +- overlapping window better? + +""" + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +import os +from datetime import datetime +from pathlib import Path + +import imageio.v3 as iio +import numpy as np +import torch +from movement.io import load_bboxes, load_poses, save_poses + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + +DEFAULT_DEVICE = ( + "cuda" + if torch.cuda.is_available() + else "mps" + if torch.backends.mps.is_available() + else "cpu" +) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Input video +video_path = Path( + "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test.mp4" +) + +window_half_length = 40 # in frames, overlapping, +step_between_query_frames = 100 + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Query points + +ground_truth_data = Path( + "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test_corrected_ST_SM_20241029_113207.csv" +) + +ds_gt = load_bboxes.from_file( + file_path=ground_truth_data, + source_software="VIA-tracks", + use_frame_numbers_from_file=False, +) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Select query points +# ------------------ +# Select one individual only +ds_gt_one = ds_gt.isel(individuals=[13 - 1]) # [3-1,13-1,35-1,57-1]) + +# Select all individuals +# ds_gt_one = ds_gt + +print(ds_gt_one) + +# Select frames +list_frames = list(range(ds_gt_one.sizes["time"])) +frames_to_select = np.array(list_frames)[ + ::step_between_query_frames +] # every N frame +print(frames_to_select) +# -------------------- + +# Prepare query points array +# it has frame as first column +queries_array = np.vstack( + [ + np.hstack( + [ + f + * np.ones((ds_gt_one.sizes["individuals"], 1)), # frame column + ds_gt_one.position.sel(time=f).values.T, # x, y columns + ] + ) + for f in range(ds_gt_one.sizes["time"]) + ] +) + +# Remove rows with nans in position +queries_array = queries_array[~np.any(np.isnan(queries_array), axis=1), :] + +# Selected queries +queries_sel = queries_array[ + [col in frames_to_select for col in queries_array[:, 0]], : +] +print(np.unique(queries_sel[:, 0])) + +# %% +# Convert to torch tensor +queries = torch.tensor(queries_sel) +queries = queries.to(torch.float).to(DEFAULT_DEVICE) + + +# %%%%%%%%%%%%%%%%%%% +# Load online model + +model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_online") + +# Set window length +model.model.window_len = window_half_length * 2 # in frames + +# Set step +model.step = window_half_length +print(model.step) # window is of width model.step * 2 + +# Move to GPU +model = model.to(DEFAULT_DEVICE) + +# model = CoTrackerOnlinePredictor( +# # checkpoint=None, +# checkpoint=( +# "/home/sminano/swc/project_ethology/tap_models_crabs/" +# "scaled_online.pth" +# ), +# window_len=2 * window_half_length, # in frames +# v2=False, +# ) + + +# %%%%%%%%%%%%%%% +# Process chunk function + + +def _process_step(window_frames, is_first_step, queries): + # Get a chunk of the video + video_chunk = ( + torch.tensor( + np.stack(window_frames[-model.step * 2 :]), device=DEFAULT_DEVICE + ) + .float() + .permute(0, 3, 1, 2)[None] + ) # (1, T, 3, H, W) + + # Process the video chunk with the model + return model( + video_chunk, + is_first_step=is_first_step, + queries=queries[None], + ) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Process video in non-overlapping chunks +window_frames: list[np.ndarray] = [] + +# Iterating over video frames, processing one window at a time: +is_first_step = True +video_iterator = iio.imiter(str(video_path), plugin="FFMPEG") +for i, frame in enumerate(video_iterator): + # Process a video chunk (non-overlapping right?) + if i % model.step == 0 and i != 0: + pred_tracks, pred_visibility = _process_step( + window_frames, + is_first_step, + queries=queries, + ) + is_first_step = False + + # append frame to window_frames + window_frames.append(frame) + + +# Processing the final video frames +# (in case video length is not a multiple of model.step) +pred_tracks, pred_visibility = _process_step( + window_frames[-(i % model.step) - model.step - 1 :], + is_first_step, + queries=queries, + # grid_query_frame=i +) + +print("Tracks are computed") + +# %% +print(pred_tracks) +print(pred_tracks.shape) # (1, T, N, 2) +print( + "pred_tracks in MB: " + f"{(pred_tracks.element_size() * pred_tracks.nelement()) / 1e6}" +) + +print(pred_visibility.shape) # (1, T, N, 1) +print( + "pred_visibility in MB: " + f"{(pred_visibility.element_size() * pred_visibility.nelement()) / 1e6}" +) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Save as a movement dataset +# (n_frames, n_space, n_keypoints, n_individuals) + +# assuming 1 query per individual in frame 0 +position_array = ( + pred_tracks.permute(1, -1, 0, -2).cpu().numpy() +) # (T, 2, 1, Nqueries) +visibility_array = pred_visibility.cpu().numpy()[0] # (T, Nqueries) + +# set to nan if non visible +# (improve this) +for i in range(visibility_array.shape[1]): + position_array[~visibility_array[:, i], :, :, i] = np.nan + +# ----------------------------- +# # get each track from its query point +# position_array_fix = np.vstack( +# [ +# position_array[ +# frames_to_select[i]:(frames_to_select[i+1] +# if i OOM + + +# # %% +# vis = Visualizer( +# save_dir="./output", +# linewidth=1, +# mode="cool", +# tracks_leave_trace=-1, +# fps=10, +# ) + +# vis.visualize( +# video, +# pred_tracks, # .to('cpu'), +# pred_visibility, +# query_frame=grid_query_frame, +# filename=f"queries_{timestamp}", +# ) + +# %% diff --git a/pyproject.toml b/pyproject.toml index 3fb07d7b..2648d341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,10 @@ classifiers = [ ] dependencies = [ "movement", + "torch", + "torchvision", + "cotracker @ git+https://github.com/facebookresearch/co-tracker.git", + "imageio[ffmpeg]", ] [project.urls] From 8be868ea5c79730ce73cdad80f5196b83d8834b3 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Fri, 25 Apr 2025 10:40:55 +0100 Subject: [PATCH 02/12] offline WIP --- notebook_cotracker_offline.py | 263 ++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 notebook_cotracker_offline.py diff --git a/notebook_cotracker_offline.py b/notebook_cotracker_offline.py new file mode 100644 index 00000000..7f5d9c82 --- /dev/null +++ b/notebook_cotracker_offline.py @@ -0,0 +1,263 @@ +# %% +# Imports +# import sleap_io as sio +import os +from datetime import datetime +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn.functional as F +from cotracker.utils.visualizer import read_video_from_path +from movement.io import load_bboxes, load_poses, save_poses + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + + +DEFAULT_DEVICE = ( + "cuda" + if torch.cuda.is_available() + else "mps" + if torch.backends.mps.is_available() + else "cpu" +) + +# %matplotlib widget + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Data paths +video_path = "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test.mp4" + +ground_truth_data = Path( + "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test_corrected_ST_SM_20241029_113207.csv" +) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Query points from gt + +ds_gt = load_bboxes.from_file( + file_path=ground_truth_data, + source_software="VIA-tracks", + use_frame_numbers_from_file=False, +) + +# Prepare query points array +# it has frame as first column +queries_array = np.vstack( + [ + np.hstack( + [ + f * np.ones((ds_gt.sizes["individuals"], 1)), # frame column + ds_gt.position.sel(time=f).values.T, # x, y columns + ] + ) + for f in range(ds_gt.sizes["time"]) + ] +) + +# Remove rows with nans in position +queries_array = queries_array[~np.any(np.isnan(queries_array), axis=1), :] + +# # Select frames +# list_frames = list(range(ds_gt.sizes["time"])) +# frames_to_select = np.array(list_frames)[ +# ::step_between_query_frames +# ] # every second frame +# queries_sel = queries_array[[col in frames_to_select for col in queries_array[:, 0]], :] + +# print(np.unique(queries_sel[:, 0])) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Read video +# TODO: is it faster with sleap_io? +video_full = read_video_from_path(video_path) +print(type(video_full)) +print(video_full.shape) # (614, 2160, 4096, 3) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Make it a torch tensor with dimensions in expected order +video_full = torch.from_numpy(video_full).permute(0, 3, 1, 2)[None] + +print(video_full.shape) # (1, 614, 3, 2160, 4096) +print(video_full.device) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Downsample frames +# out_frame_size = [216, 410] # 108, 205 +# video = F.interpolate(video[0], out_frame_size, mode="bilinear")[None] + +print(video_full.shape) # (1, 614, 3, 2160, 4096) +print(video_full.device) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Select first part of the video only (to fit in GPU) +# video = video[:, : video.shape[1] // 8] +chunk_start = 0 +video = video_full[:, chunk_start : chunk_start + 75, :, :, :] # 75 frames +print(video.shape) # (1, 307, 3, 2160, 4096) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Convert to float and place video on device +# Why do we need .float conversion? +# chatgpt: Mathematical operations like convolutions, normalizations, or matrix mults expect float32 or float16 + + +device = "cuda" +# video = video.float().to(device) +# video = video.half().to(device) # Use half precision for memory efficiency +# TODO: Make sure your video is normalized properly (video / 255.0) before converting to half() +video = video.to(torch.float).to(device) # torch.float16 + +# %% +# Check gpu memory usage +# print(torch.cuda.memory_summary()) +# %% +# Define query points +queries = torch.tensor( + [ + [0.0, 1070.1, 1697.1], + # if downsampled: [0.0, 97.09, 177.34], # point tracked from the first frame + [0.0, 980.7, 1762.2], + # if downsampled: [0.0, 106.20, 170.33], + # [113.0, 1961.00, 1665.00] + # [10.0, 600.0, 500.0], # frame number 10 + # [20.0, 750.0, 600.0], # ... + # [30.0, 900.0, 200.0], + ] +) + +# # Select all points at the first frame of the chunk +# queries = queries_array[queries_array[:, 0] == chunk_start, :] +# queries = queries[:1, :] +# queries = torch.tensor(queries) + +# Place query tensor on GPU +queries = queries.to(torch.float).to(device) # .half().to(device) torch.float16 + +# %% +# Visualize query points over frame + +# Create a list of frame numbers corresponding to each point +frame_numbers = queries[:, 0].int().unique().tolist() + +for frame_number in frame_numbers: + # get the query points for the current frame + queries_one_frame = queries[queries[:, 0] == frame_number] + + fig, ax = plt.subplots(1, 1) + # plot frame + ax.imshow( + video_full[frame_number, :, :] + ) # B T C H W -> H W C + # plot query points + ax.scatter( + x=queries_one_frame[:, 1].cpu(), y=queries_one_frame[:, 2].cpu(), s=5, c="red" + ) + + ax.set_title("Frame {}".format(frame_number)) + ax.set_xlim(0, video.shape[4]) + ax.set_ylim(0, video.shape[3]) + ax.invert_yaxis() + + +# %% +# Get Offline CoTracker model +model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_offline") + +# Use the model in half precision and move it to the GPU +# Note: this is for memory usage +model = model.to(device) # .half().to(device) # .to(torch.float16).to(device) + + +# %% +# all_half = all(p.dtype == torch.float16 for p in model.parameters()) +# print("All parameters are float16:", all_half) + +# for name, param in model.named_parameters(): +# # print(f"{name}: {param.dtype}") +# if param.dtype == torch.float32: +# param.data = param.data.to(torch.float16) +# print("PATATA") + +# for name, buffer in model.named_buffers(): +# print(f"{name}: {buffer.dtype}") + + +# %% +# Run CoTracker +pred_tracks, pred_visibility = model( + video, queries=queries[None], backward_tracking=True +) # B T N 2, B T N 1 + + +# from torch.cuda.amp import autocast +# model.eval() +# with torch.no_grad(), torch.autocast(device_type="cuda"): +# pred_tracks, pred_visibility = model( +# video, queries=queries[None], #backward_tracking=True +# ) # B T N 2, B T N 1 + +# %% +# TODO: Can I upsample the results to the original video res? +print(pred_tracks.shape) # (1, 307, 2, 2) --> Batch, Time, N of points, 2 (x,y) +print(pred_visibility.shape) + + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Save as a movement dataset + +# Assuming 1 query per individual + +# Prepare position array +# (n_frames, n_space, n_keypoints, n_individuals) +position_array = np.empty( + ds_gt.position.shape[:2] + (1,) + (ds_gt.position.shape[-1],) + # add keypoint dimension +) +position_array.fill(np.nan) +position_array[150 : 150 + 75, :, :, :] = ( + pred_tracks.permute(1, -1, 0, -2).cpu().numpy() +) + +ds = load_poses.from_numpy( + position_array=position_array, + individual_names=[f"ind_{i}" for i in range(pred_tracks.shape[2])], + keypoint_names=["centroid"], + source_software="CoTracker3", +) + + +# Export to read in napari +ds.attrs["source_file"] = "" + +# get string timestamp of today in yyyymmdd_hhmmss +timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + +save_poses.to_sleap_analysis_file( + ds, + f"output/cotracker_offline_output_{timestamp}.h5", +) + + +# %% +# # Visualize results + +# vis = Visualizer( +# save_dir="./output", +# linewidth=1, +# mode="cool", +# tracks_leave_trace=-1, +# fps=10, +# ) + +# # Generate timestamp of today in format YYYYMMDD_HHMMSS +# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + +# # Save video with predictions +# vis.visualize(video, pred_tracks, pred_visibility, filename=f"queries_{timestamp}") + +# %% From a201812e58bbc3d7bf78c5e0c42e58ce3840456d Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:56:10 +0100 Subject: [PATCH 03/12] Trying offline querying one individual multiple frames --- notebook_cotracker_offline.py | 237 +++++++++++++++++++++------------- 1 file changed, 145 insertions(+), 92 deletions(-) diff --git a/notebook_cotracker_offline.py b/notebook_cotracker_offline.py index 7f5d9c82..c18e43f9 100644 --- a/notebook_cotracker_offline.py +++ b/notebook_cotracker_offline.py @@ -1,6 +1,5 @@ # %% # Imports -# import sleap_io as sio import os from datetime import datetime from pathlib import Path @@ -23,7 +22,7 @@ else "cpu" ) -# %matplotlib widget +%matplotlib widget # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Data paths @@ -33,9 +32,23 @@ "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test_corrected_ST_SM_20241029_113207.csv" ) +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Parmeters + +# query points +step_between_query_frames = 5 +individuals_gt_ids = [57] + +# downsample video +scale_factor = 0.25 + +# clip video +chunk_start = 0 +chunk_width = 75 + # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Query points from gt +# Select query points ds_gt = load_bboxes.from_file( file_path=ground_truth_data, @@ -43,62 +56,94 @@ use_frame_numbers_from_file=False, ) + +# ------------------ +# Select individuals to use as query points +if len(individuals_gt_ids) == 0: + ds_gt_one = ds_gt +else: + ds_gt_one = ds_gt.isel(individuals=[i - 1 for i in individuals_gt_ids]) + +print(ds_gt_one) + +# Select frames +list_frames = list(range(ds_gt_one.sizes["time"])) +frames_to_select = np.array(list_frames)[ + chunk_start:chunk_start + chunk_width:step_between_query_frames +] # every N frame +print(frames_to_select) +# -------------------- + # Prepare query points array # it has frame as first column queries_array = np.vstack( [ np.hstack( [ - f * np.ones((ds_gt.sizes["individuals"], 1)), # frame column - ds_gt.position.sel(time=f).values.T, # x, y columns + f + * np.ones((ds_gt_one.sizes["individuals"], 1)), # frame column + ds_gt_one.position.sel(time=f).values.T, # x, y columns ] ) - for f in range(ds_gt.sizes["time"]) + for f in range(ds_gt_one.sizes["time"]) ] ) # Remove rows with nans in position queries_array = queries_array[~np.any(np.isnan(queries_array), axis=1), :] -# # Select frames -# list_frames = list(range(ds_gt.sizes["time"])) -# frames_to_select = np.array(list_frames)[ -# ::step_between_query_frames -# ] # every second frame -# queries_sel = queries_array[[col in frames_to_select for col in queries_array[:, 0]], :] +# Filter selected query points +queries_sel = queries_array[ + [col in frames_to_select for col in queries_array[:, 0]], : +] + +print(queries_sel.shape) -# print(np.unique(queries_sel[:, 0])) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Downsample queries by the same scale factor as the video +queries_downsampled = queries_sel * scale_factor +queries_downsampled[:, 0] = queries_sel[:, 0] +print(queries_downsampled.shape) # torch.Size([1, 614, 2]) +print(queries_downsampled) + +# convert to torch tensor and place on device +queries_downsampled = torch.tensor(queries_downsampled).to(torch.float).to( + DEFAULT_DEVICE +) # .half().to(device) torch.float16 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Read video -# TODO: is it faster with sleap_io? +# TODO: is it faster with sleap_io? yes! but then converting to torch is very slow +# %time video_full = read_video_from_path(video_path) # Wall time: 13.4 s +# %time video_full = sio.load_video(video_path) # Wall time: 27.4 ms +# %time video_full = np.array(sio.load_video(video_path)) + video_full = read_video_from_path(video_path) print(type(video_full)) print(video_full.shape) # (614, 2160, 4096, 3) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Make it a torch tensor with dimensions in expected order +# as torch tensor video_full = torch.from_numpy(video_full).permute(0, 3, 1, 2)[None] -print(video_full.shape) # (1, 614, 3, 2160, 4096) -print(video_full.device) - # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Downsample frames -# out_frame_size = [216, 410] # 108, 205 -# video = F.interpolate(video[0], out_frame_size, mode="bilinear")[None] +# Downsample video +video_downsampled = F.interpolate( + video_full[0], scale_factor=scale_factor, mode="bilinear" +)[None] + +print(video_downsampled.shape) # torch.Size([1, 614, 3, 540, 1024]) +print(video_downsampled.device) -print(video_full.shape) # (1, 614, 3, 2160, 4096) -print(video_full.device) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Select first part of the video only (to fit in GPU) # video = video[:, : video.shape[1] // 8] -chunk_start = 0 -video = video_full[:, chunk_start : chunk_start + 75, :, :, :] # 75 frames -print(video.shape) # (1, 307, 3, 2160, 4096) +video_downsampled_chunk = video_downsampled[ + :, chunk_start : chunk_start + chunk_width, :, :, : +] # 75 frames +print(video_downsampled_chunk.shape) # torch.Size([1, 75, 3, 540, 1024]) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Convert to float and place video on device @@ -110,61 +155,49 @@ # video = video.float().to(device) # video = video.half().to(device) # Use half precision for memory efficiency # TODO: Make sure your video is normalized properly (video / 255.0) before converting to half() -video = video.to(torch.float).to(device) # torch.float16 - -# %% -# Check gpu memory usage -# print(torch.cuda.memory_summary()) -# %% -# Define query points -queries = torch.tensor( - [ - [0.0, 1070.1, 1697.1], - # if downsampled: [0.0, 97.09, 177.34], # point tracked from the first frame - [0.0, 980.7, 1762.2], - # if downsampled: [0.0, 106.20, 170.33], - # [113.0, 1961.00, 1665.00] - # [10.0, 600.0, 500.0], # frame number 10 - # [20.0, 750.0, 600.0], # ... - # [30.0, 900.0, 200.0], - ] -) +video_downsampled_chunk = video_downsampled_chunk.to(torch.float).to( + device +) # torch.float16 -# # Select all points at the first frame of the chunk -# queries = queries_array[queries_array[:, 0] == chunk_start, :] -# queries = queries[:1, :] -# queries = torch.tensor(queries) -# Place query tensor on GPU -queries = queries.to(torch.float).to(device) # .half().to(device) torch.float16 - -# %% -# Visualize query points over frame +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Visualize query points over frames # Create a list of frame numbers corresponding to each point -frame_numbers = queries[:, 0].int().unique().tolist() +frame_numbers = queries_downsampled[:, 0].int().unique().tolist() for frame_number in frame_numbers: - # get the query points for the current frame - queries_one_frame = queries[queries[:, 0] == frame_number] - - fig, ax = plt.subplots(1, 1) - # plot frame - ax.imshow( - video_full[frame_number, :, :] - ) # B T C H W -> H W C - # plot query points - ax.scatter( - x=queries_one_frame[:, 1].cpu(), y=queries_one_frame[:, 2].cpu(), s=5, c="red" - ) + if frame_number in list(range(video_downsampled_chunk.shape[1])): + # get the query points for the current frame + queries_one_frame = queries_downsampled[ + queries_downsampled[:, 0] == frame_number + ] + + fig, ax = plt.subplots(1, 1) + # plot frame + ax.imshow( + video_downsampled_chunk.permute(0, 1, -2, -1, -3)[ + 0, frame_number, :, :, : + ] + .cpu() + .numpy() + .astype(np.int32) + ) # B T C H W -> H W C + # plot query points + ax.scatter( + x=queries_one_frame[:, 1].cpu(), + y=queries_one_frame[:, 2].cpu(), + s=5, + c="red", + ) - ax.set_title("Frame {}".format(frame_number)) - ax.set_xlim(0, video.shape[4]) - ax.set_ylim(0, video.shape[3]) - ax.invert_yaxis() + ax.set_title("Frame {}".format(frame_number)) + ax.set_xlim(0, video_downsampled_chunk.shape[4]) + ax.set_ylim(0, video_downsampled_chunk.shape[3]) + ax.invert_yaxis() -# %% +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Get Offline CoTracker model model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_offline") @@ -172,8 +205,9 @@ # Note: this is for memory usage model = model.to(device) # .half().to(device) # .to(torch.float16).to(device) +print(model.model.window_len) -# %% +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # all_half = all(p.dtype == torch.float16 for p in model.parameters()) # print("All parameters are float16:", all_half) @@ -187,10 +221,12 @@ # print(f"{name}: {buffer.dtype}") -# %% +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Run CoTracker pred_tracks, pred_visibility = model( - video, queries=queries[None], backward_tracking=True + video_downsampled_chunk, + queries=queries_downsampled[None], + backward_tracking=True, ) # B T N 2, B T N 1 @@ -206,26 +242,43 @@ print(pred_tracks.shape) # (1, 307, 2, 2) --> Batch, Time, N of points, 2 (x,y) print(pred_visibility.shape) - # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Save as a movement dataset +# Upsample results to the original video resolution -# Assuming 1 query per individual +pred_tracks_upsampled = pred_tracks*1 / scale_factor -# Prepare position array +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Save as a movement dataset # (n_frames, n_space, n_keypoints, n_individuals) -position_array = np.empty( - ds_gt.position.shape[:2] + (1,) + (ds_gt.position.shape[-1],) - # add keypoint dimension -) -position_array.fill(np.nan) -position_array[150 : 150 + 75, :, :, :] = ( - pred_tracks.permute(1, -1, 0, -2).cpu().numpy() -) + +# assuming 1 query is 1 individual +position_array = ( + pred_tracks_upsampled.permute(1, -1, 0, -2).cpu().numpy() +) # (T, 2, 1, Nqueries) +visibility_array = pred_visibility.cpu().numpy()[0] # (T, Nqueries) + +# set position to nan if non visible +# (improve this) +for i in range(visibility_array.shape[1]): + position_array[~visibility_array[:, i], :, :, i] = np.nan + +# ----------------------------- +# # get each track from its query point +# position_array_fix = np.vstack( +# [ +# position_array[ +# frames_to_select[i]:(frames_to_select[i+1] +# if i Date: Fri, 25 Apr 2025 12:14:08 +0100 Subject: [PATCH 04/12] Trying offline querying all individuals first frame --- MANIFEST.in | 1 + notebook_cotracker_offline.py | 74 +++++++++++++++-------------------- notebook_cotracker_online.py | 2 +- 3 files changed, 33 insertions(+), 44 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index d45a5b91..97086b6d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -16,3 +16,4 @@ recursive-include ethology/annotations/json_schemas/schemas *.md # Temporarily include notebooks include notebook_cotracker_online.py +include notebook_cotracker_offline.py diff --git a/notebook_cotracker_offline.py b/notebook_cotracker_offline.py index c18e43f9..ef424920 100644 --- a/notebook_cotracker_offline.py +++ b/notebook_cotracker_offline.py @@ -1,3 +1,5 @@ +"""Offline tracking with CoTracker3.""" + # %% # Imports import os @@ -22,29 +24,32 @@ else "cpu" ) -%matplotlib widget +# %matplotlib widget # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Data paths -video_path = "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test.mp4" +video_path = ( + "/home/sminano/swc/project_ethology/tap_models_crabs/" + "input/04.09.2023-04-Right_RE_test.mp4" +) ground_truth_data = Path( "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test_corrected_ST_SM_20241029_113207.csv" ) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Parmeters +# Parameters # query points -step_between_query_frames = 5 -individuals_gt_ids = [57] +step_between_query_frames: int = 1000 +individuals_gt_ids: list[int] = [] # downsample video -scale_factor = 0.25 +scale_factor: float = 0.25 # clip video -chunk_start = 0 -chunk_width = 75 +chunk_start: int = 0 +chunk_width = 100 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -69,7 +74,7 @@ # Select frames list_frames = list(range(ds_gt_one.sizes["time"])) frames_to_select = np.array(list_frames)[ - chunk_start:chunk_start + chunk_width:step_between_query_frames + chunk_start : chunk_start + chunk_width : step_between_query_frames ] # every N frame print(frames_to_select) # -------------------- @@ -108,14 +113,15 @@ print(queries_downsampled) # convert to torch tensor and place on device -queries_downsampled = torch.tensor(queries_downsampled).to(torch.float).to( - DEFAULT_DEVICE -) # .half().to(device) torch.float16 +queries_downsampled_tensor: torch.Tensor = ( + torch.from_numpy(queries_downsampled).to(torch.float).to(DEFAULT_DEVICE) +) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Read video -# TODO: is it faster with sleap_io? yes! but then converting to torch is very slow +# TODO: is it faster with sleap_io? yes! but then converting +# to torch is very slow # %time video_full = read_video_from_path(video_path) # Wall time: 13.4 s # %time video_full = sio.load_video(video_path) # Wall time: 27.4 ms # %time video_full = np.array(sio.load_video(video_path)) @@ -148,13 +154,13 @@ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Convert to float and place video on device # Why do we need .float conversion? -# chatgpt: Mathematical operations like convolutions, normalizations, or matrix mults expect float32 or float16 +# chatgpt: Mathematical operations like convolutions, normalizations, +# or matrix mults expect float32 or float16 device = "cuda" # video = video.float().to(device) # video = video.half().to(device) # Use half precision for memory efficiency -# TODO: Make sure your video is normalized properly (video / 255.0) before converting to half() video_downsampled_chunk = video_downsampled_chunk.to(torch.float).to( device ) # torch.float16 @@ -164,13 +170,13 @@ # Visualize query points over frames # Create a list of frame numbers corresponding to each point -frame_numbers = queries_downsampled[:, 0].int().unique().tolist() +frame_numbers = queries_downsampled_tensor[:, 0].unique().tolist() for frame_number in frame_numbers: if frame_number in list(range(video_downsampled_chunk.shape[1])): # get the query points for the current frame - queries_one_frame = queries_downsampled[ - queries_downsampled[:, 0] == frame_number + queries_one_frame = queries_downsampled_tensor[ + queries_downsampled_tensor[:, 0] == frame_number ] fig, ax = plt.subplots(1, 1) @@ -191,7 +197,7 @@ c="red", ) - ax.set_title("Frame {}".format(frame_number)) + ax.set_title(f"Frame {frame_number}") ax.set_xlim(0, video_downsampled_chunk.shape[4]) ax.set_ylim(0, video_downsampled_chunk.shape[3]) ax.invert_yaxis() @@ -224,8 +230,8 @@ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Run CoTracker pred_tracks, pred_visibility = model( - video_downsampled_chunk, - queries=queries_downsampled[None], + video_downsampled_chunk, + queries=queries_downsampled_tensor[None], backward_tracking=True, ) # B T N 2, B T N 1 @@ -239,13 +245,15 @@ # %% # TODO: Can I upsample the results to the original video res? -print(pred_tracks.shape) # (1, 307, 2, 2) --> Batch, Time, N of points, 2 (x,y) +print( + pred_tracks.shape +) # (1, 307, 2, 2) --> Batch, Time, N of points, 2 (x,y) print(pred_visibility.shape) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Upsample results to the original video resolution -pred_tracks_upsampled = pred_tracks*1 / scale_factor +pred_tracks_upsampled = pred_tracks * 1 / scale_factor # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Save as a movement dataset @@ -294,23 +302,3 @@ ds, f"../tap_models_crabs/output/cotracker_offline_output_{timestamp}.h5", ) - - -# %% -# # Visualize results - -# vis = Visualizer( -# save_dir="./output", -# linewidth=1, -# mode="cool", -# tracks_leave_trace=-1, -# fps=10, -# ) - -# # Generate timestamp of today in format YYYYMMDD_HHMMSS -# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - -# # Save video with predictions -# vis.visualize(video, pred_tracks, pred_visibility, filename=f"queries_{timestamp}") - -# %% diff --git a/notebook_cotracker_online.py b/notebook_cotracker_online.py index f9378f84..5896a09b 100644 --- a/notebook_cotracker_online.py +++ b/notebook_cotracker_online.py @@ -1,4 +1,4 @@ -"""Multi-window approach for online tracking with CoTracker3. +"""Sliding-window approach for online tracking with CoTracker3. Todo: - more query points? From f959ee90f858396dda3099a44b71832cd7813372 Mon Sep 17 00:00:00 2001 From: AnandMayank Date: Wed, 18 Feb 2026 18:53:40 +0530 Subject: [PATCH 05/12] feat(ethology): add lightweight trajectory re-identification utility and BoxMOT-style model loading\n\n- Add reid module with motion-based re-identification\n- Refactor OSNet and TensorRT model loading to match BoxMOT approach (auto-download/cache)\n- Add unit tests for reid utility\n\nThis closes the gap for robust ID handling in long ethology recordings and improves reproducibility for all users. --- ethology/reid/__init__.py | 1 + ethology/reid/backbones/__init__.py | 1 + ethology/reid/backbones/hacnn.py | 292 ++++++++++++++ ethology/reid/backbones/mlfn.py | 239 ++++++++++++ ethology/reid/backbones/mobilenetv2.py | 247 ++++++++++++ ethology/reid/backbones/osnet.py | 345 +++++++++++++++++ ethology/reid/backbones/osnet_ain.py | 356 ++++++++++++++++++ ethology/reid/backbones/resnet.py | 273 ++++++++++++++ ethology/reid/backends/__init__.py | 1 + ethology/reid/backends/base_backend.py | 170 +++++++++ ethology/reid/backends/onnx_backend.py | 31 ++ ethology/reid/backends/openvino_backend.py | 48 +++ ethology/reid/backends/pytorch_backend.py | 20 + ethology/reid/backends/tensorrt_backend.py | 310 +++++++++++++++ ethology/reid/backends/tflite_backend.py | 40 ++ ethology/reid/backends/torchscript_backend.py | 20 + ethology/reid/core/__init__.py | 1 + ethology/reid/core/auto_backend.py | 74 ++++ ethology/reid/core/config.py | 16 + ethology/reid/core/factory.py | 30 ++ ethology/reid/core/handler.py | 33 ++ ethology/reid/core/registry.py | 71 ++++ ethology/reid/core/reid_handler.py | 28 ++ tests/test_unit/test_reid_handler.py | 12 + 24 files changed, 2659 insertions(+) create mode 100644 ethology/reid/__init__.py create mode 100644 ethology/reid/backbones/__init__.py create mode 100644 ethology/reid/backbones/hacnn.py create mode 100644 ethology/reid/backbones/mlfn.py create mode 100644 ethology/reid/backbones/mobilenetv2.py create mode 100644 ethology/reid/backbones/osnet.py create mode 100644 ethology/reid/backbones/osnet_ain.py create mode 100644 ethology/reid/backbones/resnet.py create mode 100644 ethology/reid/backends/__init__.py create mode 100644 ethology/reid/backends/base_backend.py create mode 100644 ethology/reid/backends/onnx_backend.py create mode 100644 ethology/reid/backends/openvino_backend.py create mode 100644 ethology/reid/backends/pytorch_backend.py create mode 100644 ethology/reid/backends/tensorrt_backend.py create mode 100644 ethology/reid/backends/tflite_backend.py create mode 100644 ethology/reid/backends/torchscript_backend.py create mode 100644 ethology/reid/core/__init__.py create mode 100644 ethology/reid/core/auto_backend.py create mode 100644 ethology/reid/core/config.py create mode 100644 ethology/reid/core/factory.py create mode 100644 ethology/reid/core/handler.py create mode 100644 ethology/reid/core/registry.py create mode 100644 ethology/reid/core/reid_handler.py create mode 100644 tests/test_unit/test_reid_handler.py diff --git a/ethology/reid/__init__.py b/ethology/reid/__init__.py new file mode 100644 index 00000000..d2f6d334 --- /dev/null +++ b/ethology/reid/__init__.py @@ -0,0 +1 @@ +# ReID module for ethology diff --git a/ethology/reid/backbones/__init__.py b/ethology/reid/backbones/__init__.py new file mode 100644 index 00000000..8e23892b --- /dev/null +++ b/ethology/reid/backbones/__init__.py @@ -0,0 +1 @@ +# Backbones for ReID models diff --git a/ethology/reid/backbones/hacnn.py b/ethology/reid/backbones/hacnn.py new file mode 100644 index 00000000..e48ea2b7 --- /dev/null +++ b/ethology/reid/backbones/hacnn.py @@ -0,0 +1,292 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +from __future__ import absolute_import, division + +import torch +from torch import nn +from torch.nn import functional as F + +__all__ = ["HACNN"] + + +class ConvBlock(nn.Module): + def __init__(self, in_c, out_c, k, s=1, p=0): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) + self.bn = nn.BatchNorm2d(out_c) + def forward(self, x): + return F.relu(self.bn(self.conv(x))) + +class InceptionA(nn.Module): + def __init__(self, in_channels, out_channels): + super(InceptionA, self).__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream3 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream4 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1), + ConvBlock(in_channels, mid_channels, 1), + ) + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + s4 = self.stream4(x) + y = torch.cat([s1, s2, s3, s4], dim=1) + return y + +class InceptionB(nn.Module): + def __init__(self, in_channels, out_channels): + super(InceptionB, self).__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream3 = nn.Sequential( + nn.MaxPool2d(3, stride=2, padding=1), + ConvBlock(in_channels, mid_channels * 2, 1), + ) + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + y = torch.cat([s1, s2, s3], dim=1) + return y + +class SpatialAttn(nn.Module): + def __init__(self): + super(SpatialAttn, self).__init__() + self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) + self.conv2 = ConvBlock(1, 1, 1) + def forward(self, x): + x = x.mean(1, keepdim=True) + x = self.conv1(x) + x = F.interpolate( + x, (x.size(2) * 2, x.size(3) * 2), mode="bilinear", align_corners=True + ) + x = self.conv2(x) + return x + +class ChannelAttn(nn.Module): + def __init__(self, in_channels, reduction_rate=16): + super(ChannelAttn, self).__init__() + assert in_channels % reduction_rate == 0 + self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) + self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]) + x = self.conv1(x) + x = self.conv2(x) + return x + +class SoftAttn(nn.Module): + def __init__(self, in_channels): + super(SoftAttn, self).__init__() + self.spatial_attn = SpatialAttn() + self.channel_attn = ChannelAttn(in_channels) + self.conv = ConvBlock(in_channels, in_channels, 1) + def forward(self, x): + y_spatial = self.spatial_attn(x) + y_channel = self.channel_attn(x) + y = y_spatial * y_channel + y = torch.sigmoid(self.conv(y)) + return y + +class HardAttn(nn.Module): + def __init__(self, in_channels): + super(HardAttn, self).__init__() + self.fc = nn.Linear(in_channels, 4 * 2) + self.init_params() + def init_params(self): + self.fc.weight.data.zero_() + self.fc.bias.data.copy_( + torch.tensor([0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float) + ) + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) + theta = torch.tanh(self.fc(x)) + theta = theta.view(-1, 4, 2) + return theta + +class HarmAttn(nn.Module): + def __init__(self, in_channels): + super(HarmAttn, self).__init__() + self.soft_attn = SoftAttn(in_channels) + self.hard_attn = HardAttn(in_channels) + def forward(self, x): + y_soft_attn = self.soft_attn(x) + theta = self.hard_attn(x) + return y_soft_attn, theta + +class HACNN(nn.Module): + def __init__( + self, + num_classes, + loss="softmax", + nchannels=[128, 256, 384], + feat_dim=512, + learn_region=True, + use_gpu=True, + **kwargs, + ): + super(HACNN, self).__init__() + self.loss = loss + self.learn_region = learn_region + self.use_gpu = use_gpu + self.conv = ConvBlock(3, 32, 3, s=2, p=1) + self.inception1 = nn.Sequential( + InceptionA(32, nchannels[0]), + InceptionB(nchannels[0], nchannels[0]), + ) + self.ha1 = HarmAttn(nchannels[0]) + self.inception2 = nn.Sequential( + InceptionA(nchannels[0], nchannels[1]), + InceptionB(nchannels[1], nchannels[1]), + ) + self.ha2 = HarmAttn(nchannels[1]) + self.inception3 = nn.Sequential( + InceptionA(nchannels[1], nchannels[2]), + InceptionB(nchannels[2], nchannels[2]), + ) + self.ha3 = HarmAttn(nchannels[2]) + self.fc_global = nn.Sequential( + nn.Linear(nchannels[2], feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_global = nn.Linear(feat_dim, num_classes) + if self.learn_region: + self.init_scale_factors() + self.local_conv1 = InceptionB(32, nchannels[0]) + self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) + self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) + self.fc_local = nn.Sequential( + nn.Linear(nchannels[2] * 4, feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_local = nn.Linear(feat_dim, num_classes) + self.feat_dim = feat_dim * 2 + else: + self.feat_dim = feat_dim + def init_scale_factors(self): + self.scale_factors = [] + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + def stn(self, x, theta): + grid = F.affine_grid(theta, x.size()) + x = F.grid_sample(x, grid) + return x + def transform_theta(self, theta_i, region_idx): + scale_factors = self.scale_factors[region_idx] + theta = torch.zeros(theta_i.size(0), 2, 3) + theta[:, :, :2] = scale_factors + theta[:, :, -1] = theta_i + if self.use_gpu: + theta = theta.to(next(self.parameters()).device) + return theta + def forward(self, x): + assert ( + x.size(2) == 160 and x.size(3) == 64 + ), "Input size does not match, expected (160, 64) but got ({}, {})".format( + x.size(2), x.size(3) + ) + x = self.conv(x) + x1 = self.inception1(x) + x1_attn, x1_theta = self.ha1(x1) + x1_out = x1 * x1_attn + if self.learn_region: + x1_local_list = [] + for region_idx in range(4): + x1_theta_i = x1_theta[:, region_idx, :] + x1_theta_i = self.transform_theta(x1_theta_i, region_idx) + x1_trans_i = self.stn(x, x1_theta_i) + x1_trans_i = F.interpolate( + x1_trans_i, (24, 28), mode="bilinear", align_corners=True + ) + x1_local_i = self.local_conv1(x1_trans_i) + x1_local_list.append(x1_local_i) + x2 = self.inception2(x1_out) + x2_attn, x2_theta = self.ha2(x2) + x2_out = x2 * x2_attn + if self.learn_region: + x2_local_list = [] + for region_idx in range(4): + x2_theta_i = x2_theta[:, region_idx, :] + x2_theta_i = self.transform_theta(x2_theta_i, region_idx) + x2_trans_i = self.stn(x1_out, x2_theta_i) + x2_trans_i = F.interpolate( + x2_trans_i, (12, 14), mode="bilinear", align_corners=True + ) + x2_local_i = x2_trans_i + x1_local_list[region_idx] + x2_local_i = self.local_conv2(x2_local_i) + x2_local_list.append(x2_local_i) + x3 = self.inception3(x2_out) + x3_attn, x3_theta = self.ha3(x3) + x3_out = x3 * x3_attn + if self.learn_region: + x3_local_list = [] + for region_idx in range(4): + x3_theta_i = x3_theta[:, region_idx, :] + x3_theta_i = self.transform_theta(x3_theta_i, region_idx) + x3_trans_i = self.stn(x2_out, x3_theta_i) + x3_trans_i = F.interpolate( + x3_trans_i, (6, 7), mode="bilinear", align_corners=True + ) + x3_local_i = x3_trans_i + x2_local_list[region_idx] + x3_local_i = self.local_conv3(x3_local_i) + x3_local_list.append(x3_local_i) + x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( + x3_out.size(0), x3_out.size(1) + ) + x_global = self.fc_global(x_global) + if self.learn_region: + x_local_list = [] + for region_idx in range(4): + x_local_i = x3_local_list[region_idx] + x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( + x_local_i.size(0), -1 + ) + x_local_list.append(x_local_i) + x_local = torch.cat(x_local_list, 1) + x_local = self.fc_local(x_local) + if not self.training: + if self.learn_region: + x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) + x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) + return torch.cat([x_global, x_local], 1) + else: + return x_global + prelogits_global = self.classifier_global(x_global) + if self.learn_region: + prelogits_local = self.classifier_local(x_local) + if self.loss == "softmax": + if self.learn_region: + return (prelogits_global, prelogits_local) + else: + return prelogits_global + elif self.loss == "triplet": + if self.learn_region: + return (prelogits_global, prelogits_local), (x_global, x_local) + else: + return prelogits_global, x_global + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py new file mode 100644 index 00000000..bb1e235c --- /dev/null +++ b/ethology/reid/backbones/mlfn.py @@ -0,0 +1,239 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +from __future__ import absolute_import, division +import torch +import torch.utils.model_zoo as model_zoo +from torch import nn +from torch.nn import functional as F + +__all__ = ["mlfn"] +model_urls = { + # training epoch = 5, top1 = 51.6 + "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", +} + + +class MLFNBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, fsm_channels, groups=32): + super(MLFNBlock, self).__init__() + self.groups = groups + mid_channels = out_channels // 2 + + # Factor Modules + self.fm_conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False) + self.fm_bn1 = nn.BatchNorm2d(mid_channels) + self.fm_conv2 = nn.Conv2d( + mid_channels, + mid_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=self.groups, + ) + self.fm_bn2 = nn.BatchNorm2d(mid_channels) + self.fm_conv3 = nn.Conv2d(mid_channels, out_channels, 1, bias=False) + self.fm_bn3 = nn.BatchNorm2d(out_channels) + + # Factor Selection Module + self.fsm = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, fsm_channels[0], 1), + nn.BatchNorm2d(fsm_channels[0]), + nn.ReLU(inplace=True), + nn.Conv2d(fsm_channels[0], fsm_channels[1], 1), + nn.BatchNorm2d(fsm_channels[1]), + nn.ReLU(inplace=True), + nn.Conv2d(fsm_channels[1], self.groups, 1), + nn.BatchNorm2d(self.groups), + nn.Sigmoid(), + ) + + self.downsample = None + if in_channels != out_channels or stride > 1: + self.downsample = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + residual = x + s = self.fsm(x) + + # reduce dimension + x = self.fm_conv1(x) + x = self.fm_bn1(x) + x = F.relu(x, inplace=True) + + # group convolution + x = self.fm_conv2(x) + x = self.fm_bn2(x) + x = F.relu(x, inplace=True) + + # factor selection + b, c = x.size(0), x.size(1) + n = c // self.groups + ss = s.repeat(1, n, 1, 1) # from (b, g, 1, 1) to (b, g*n=c, 1, 1) + ss = ss.view(b, n, self.groups, 1, 1) + ss = ss.permute(0, 2, 1, 3, 4).contiguous() + ss = ss.view(b, c, 1, 1) + x = ss * x + + # recover dimension + x = self.fm_conv3(x) + x = self.fm_bn3(x) + x = F.relu(x, inplace=True) + + if self.downsample is not None: + residual = self.downsample(residual) + + return F.relu(residual + x, inplace=True), s + + +class MLFN(nn.Module): + """Multi-Level Factorisation Net. + + Reference: + Chang et al. Multi-Level Factorisation Net for + Person Re-Identification. CVPR 2018. + + Public keys: + - ``mlfn``: MLFN (Multi-Level Factorisation Net). + """ + + def __init__( + self, + num_classes, + loss="softmax", + groups=32, + channels=[64, 256, 512, 1024, 2048], + embed_dim=1024, + **kwargs, + ): + super(MLFN, self).__init__() + self.loss = loss + self.groups = groups + + # first convolutional layer + self.conv1 = nn.Conv2d(3, channels[0], 7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(channels[0]) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + + # main body + self.feature = nn.ModuleList( + [ + # layer 1-3 + MLFNBlock(channels[0], channels[1], 1, [128, 64], self.groups), + MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), + MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), + # layer 4-7 + MLFNBlock(channels[1], channels[2], 2, [256, 128], self.groups), + MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), + MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), + MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), + # layer 8-13 + MLFNBlock(channels[2], channels[3], 2, [512, 128], self.groups), + MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), + MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), + MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), + MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), + MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), + # layer 14-16 + MLFNBlock(channels[3], channels[4], 2, [512, 128], self.groups), + MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups), + MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups), + ] + ) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + + # projection functions + self.fc_x = nn.Sequential( + nn.Conv2d(channels[4], embed_dim, 1, bias=False), + nn.BatchNorm2d(embed_dim), + nn.ReLU(inplace=True), + ) + self.fc_s = nn.Sequential( + nn.Conv2d(self.groups * 16, embed_dim, 1, bias=False), + nn.BatchNorm2d(embed_dim), + nn.ReLU(inplace=True), + ) + + self.classifier = nn.Linear(embed_dim, num_classes) + + self.init_params() + + def init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x, inplace=True) + x = self.maxpool(x) + + s_hat = [] + for block in self.feature: + x, s = block(x) + s_hat.append(s) + s_hat = torch.cat(s_hat, 1) + + x = self.global_avgpool(x) + x = self.fc_x(x) + s_hat = self.fc_s(s_hat) + + v = (x + s_hat) * 0.5 + v = v.view(v.size(0), -1) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + + +def init_pretrained_weights(model, model_url): + """Initializes model with pretrained weights. + + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) + + +def mlfn(num_classes, loss="softmax", pretrained=True, **kwargs): + model = MLFN(num_classes, loss, **kwargs) + if pretrained: + # init_pretrained_weights(model, model_urls['imagenet']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["imagenet"] + ) + ) + return model +# Copied from boxmot/boxmot/reid/backbones/mlfn.py diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py new file mode 100644 index 00000000..35a16219 --- /dev/null +++ b/ethology/reid/backbones/mobilenetv2.py @@ -0,0 +1,247 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +from __future__ import absolute_import, division + +import torch.utils.model_zoo as model_zoo +from torch import nn +from torch.nn import functional as F + +__all__ = ["mobilenetv2_x1_0", "mobilenetv2_x1_4"] + +model_urls = { + # 1.0: top-1 71.3 + "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", + # 1.4: top-1 73.9 + "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", +} + + +class ConvBlock(nn.Module): + """Basic convolutional block. + + convolution (bias discarded) + batch normalization + relu6. + + Args: + in_c (int): number of input channels. + out_c (int): number of output channels. + k (int or tuple): kernel size. + s (int or tuple): stride. + p (int or tuple): padding. + g (int): number of blocked connections from input channels + to output channels (default: 1). + """ + + def __init__(self, in_c, out_c, k, s=1, p=0, g=1): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) + self.bn = nn.BatchNorm2d(out_c) + + def forward(self, x): + return F.relu6(self.bn(self.conv(x))) + + +class Bottleneck(nn.Module): + def __init__(self, in_channels, out_channels, expansion_factor, stride=1): + super(Bottleneck, self).__init__() + mid_channels = in_channels * expansion_factor + self.use_residual = stride == 1 and in_channels == out_channels + self.conv1 = ConvBlock(in_channels, mid_channels, 1) + self.dwconv2 = ConvBlock( + mid_channels, mid_channels, 3, stride, 1, g=mid_channels + ) + self.conv3 = nn.Sequential( + nn.Conv2d(mid_channels, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + m = self.conv1(x) + m = self.dwconv2(m) + m = self.conv3(m) + if self.use_residual: + return x + m + else: + return m + + +class MobileNetV2(nn.Module): + """MobileNetV2. + + Reference: + Sandler et al. MobileNetV2: Inverted Residuals and + Linear Bottlenecks. CVPR 2018. + + Public keys: + - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. + - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. + """ + + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super(MobileNetV2, self).__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + + # construct layers + self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) + self.conv2 = self._make_layer(Bottleneck, 1, int(16 * width_mult), 1, 1) + self.conv3 = self._make_layer(Bottleneck, 6, int(24 * width_mult), 2, 2) + self.conv4 = self._make_layer(Bottleneck, 6, int(32 * width_mult), 3, 2) + self.conv5 = self._make_layer(Bottleneck, 6, int(64 * width_mult), 4, 2) + self.conv6 = self._make_layer(Bottleneck, 6, int(96 * width_mult), 3, 1) + self.conv7 = self._make_layer(Bottleneck, 6, int(160 * width_mult), 3, 2) + self.conv8 = self._make_layer(Bottleneck, 6, int(320 * width_mult), 1, 1) + self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) + + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(fc_dims, self.feature_dim, dropout_p) + self.classifier = nn.Linear(self.feature_dim, num_classes) + + self._init_params() + + def _make_layer(self, block, t, c, n, s): + # t: expansion factor + # c: output channels + # n: number of blocks + # s: stride for first layer + layers = [] + layers.append(block(self.in_channels, c, t, s)) + self.in_channels = c + for i in range(1, n): + layers.append(block(self.in_channels, c, t)) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + """Constructs fully connected layer. + + Args: + fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed + input_dim (int): input dimension + dropout_p (float): dropout probability, if None, dropout is unused + """ + if fc_dims is None: + self.feature_dim = input_dim + return None + + assert isinstance( + fc_dims, (list, tuple) + ), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) + + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + + self.feature_dim = fc_dims[-1] + + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = self.conv6(x) + x = self.conv7(x) + x = self.conv8(x) + x = self.conv9(x) + return x + + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + + if self.fc is not None: + v = self.fc(v) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + + +def init_pretrained_weights(model, model_url): + """Initializes model with pretrained weights. + + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) + + +def mobilenetv2_x1_0(num_classes, loss, pretrained=True, **kwargs): + model = MobileNetV2( + num_classes, loss=loss, width_mult=1, fc_dims=None, dropout_p=None, **kwargs + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_0"] + ) + ) + return model + + +def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs): + model = MobileNetV2( + num_classes, loss=loss, width_mult=1.4, fc_dims=None, dropout_p=None, **kwargs + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_4"] + ) + ) + return model +# Copied from boxmot/boxmot/reid/backbones/mobilenetv2.py diff --git a/ethology/reid/backbones/osnet.py b/ethology/reid/backbones/osnet.py new file mode 100644 index 00000000..c07e4e45 --- /dev/null +++ b/ethology/reid/backbones/osnet.py @@ -0,0 +1,345 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +from __future__ import absolute_import, division + +import warnings + +import torch +from torch import nn +from torch.nn import functional as F + +__all__ = ["osnet_x1_0", "osnet_x0_75", "osnet_x0_5", "osnet_x0_25", "osnet_ibn_x1_0"] + +pretrained_urls = { + "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", + "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", + "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", + "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", + "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", +} + +# ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, ChannelGate, OSBlock... + +class ConvLayer(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + +class Conv1x1(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + +class Conv1x1Linear(nn.Module): + def __init__(self, in_channels, out_channels, stride=1): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) + self.bn = nn.BatchNorm2d(out_channels) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + +class Conv3x3(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + +class LightConv3x3(nn.Module): + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + x = self.relu(x) + return x + +class ChannelGate(nn.Module): + def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU(inplace=True) + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + +class OSBlock(nn.Module): + def __init__(self, in_channels, out_channels, IN=False, bottleneck_reduction=4, **kwargs): + super(OSBlock, self).__init__() + mid_channels = out_channels // bottleneck_reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2a = LightConv3x3(mid_channels, mid_channels) + self.conv2b = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2c = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2d = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = None + if IN: + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2a = self.conv2a(x1) + x2b = self.conv2b(x1) + x2c = self.conv2c(x1) + x2d = self.conv2d(x1) + x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + if self.IN is not None: + out = self.IN(out) + return F.relu(out) + +class OSNet(nn.Module): + def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", IN=False, **kwargs): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1], reduce_spatial_size=True, IN=IN) + self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2], reduce_spatial_size=True) + self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3], reduce_spatial_size=False) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + def _make_layer(self, block, layer, in_channels, out_channels, reduce_spatial_size, IN=False): + layers = [] + layers.append(block(in_channels, out_channels, IN=IN)) + for i in range(1, layer): + layers.append(block(out_channels, out_channels, IN=IN)) + if reduce_spatial_size: + layers.append(nn.Sequential(Conv1x1(out_channels, out_channels), nn.AvgPool2d(2, stride=2))) + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + +def init_pretrained_weights(model, key=""): + import errno + import os + from collections import OrderedDict + import gdown + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), + ) + ) + return torch_home + filename = key + "_imagenet.pth" + # Try ethology/models/ directory first + ethology_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) + models_dir = os.path.join(ethology_root, "models") + os.makedirs(models_dir, exist_ok=True) + local_file = os.path.join(models_dir, filename) + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + os.makedirs(model_dir, exist_ok=True) + cached_file = os.path.join(model_dir, filename) + # Prefer ethology/models/ directory file if present + if os.path.exists(local_file): + print(f"[OSNet] Loading model weights from {local_file}") + cached_file = local_file + elif os.path.exists(cached_file): + print(f"[OSNet] Loading model weights from {cached_file}") + else: + print(f"[OSNet] Downloading model weights to {cached_file}") + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights from "{}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)".format(cached_file) + ) + else: + print( + 'Successfully loaded imagenet pretrained weights from "{}"'.format( + cached_file + ) + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + "due to unmatched keys or layer size: {}".format(discarded_layers) + ) + +def osnet_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x1_0") + return model + +def osnet_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_75") + return model + +def osnet_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_5") + return model + +def osnet_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_25") + return model + +def osnet_ibn_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ibn_x1_0") + return model diff --git a/ethology/reid/backbones/osnet_ain.py b/ethology/reid/backbones/osnet_ain.py new file mode 100644 index 00000000..9e052209 --- /dev/null +++ b/ethology/reid/backbones/osnet_ain.py @@ -0,0 +1,356 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +from __future__ import absolute_import, division + +import warnings + +import torch +from torch import nn +from torch.nn import functional as F + +__all__ = ["osnet_ain_x1_0", "osnet_ain_x0_75", "osnet_ain_x0_5", "osnet_ain_x0_25"] + +pretrained_urls = { + "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", + "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", + "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", + "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", +} + +# ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, LightConvStream, ChannelGate, OSBlock, OSBlockINin, OSNet, init_pretrained_weights, and instantiation functions... + +class ConvLayer(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + +class Conv1x1(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + +class Conv1x1Linear(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, bn=True): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) + self.bn = None + if bn: + self.bn = nn.BatchNorm2d(out_channels) + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + return x + +class Conv3x3(nn.Module): + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + +class LightConv3x3(nn.Module): + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + return self.relu(x) + +class LightConvStream(nn.Module): + def __init__(self, in_channels, out_channels, depth): + super(LightConvStream, self).__init__() + assert depth >= 1 + layers = [LightConv3x3(in_channels, out_channels)] + for i in range(depth - 1): + layers.append(LightConv3x3(out_channels, out_channels)) + self.layers = nn.Sequential(*layers) + def forward(self, x): + return self.layers(x) + +class ChannelGate(nn.Module): + def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU() + self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU() + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + +class OSBlock(nn.Module): + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlock, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + +class OSBlockINin(nn.Module): + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlockINin, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + x3 = self.IN(x3) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + +class OSNet(nn.Module): + def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", conv1_IN=False, **kwargs): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=conv1_IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1]) + self.pool2 = nn.Sequential(Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2)) + self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2]) + self.pool3 = nn.Sequential(Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2)) + self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3]) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + def _make_layer(self, blocks, layer, in_channels, out_channels): + layers = [] + layers += [blocks[0](in_channels, out_channels)] + for i in range(1, len(blocks)): + layers += [blocks[i](out_channels, out_channels)] + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU()) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.InstanceNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.pool2(x) + x = self.conv3(x) + x = self.pool3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + +def init_pretrained_weights(model, key=""): + import errno + import os + from collections import OrderedDict + import gdown + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), + ) + ) + return torch_home + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + try: + os.makedirs(model_dir) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + filename = key + "_imagenet.pth" + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights from "{}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)".format(cached_file) + ) + else: + print( + 'Successfully loaded imagenet pretrained weights from "{}"'.format( + cached_file + ) + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + "due to unmatched keys or layer size: {}".format(discarded_layers) + ) + +def osnet_ain_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x1_0") + return model + +def osnet_ain_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_75") + return model + +def osnet_ain_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_5") + return model + +def osnet_ain_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_25") + return model diff --git a/ethology/reid/backbones/resnet.py b/ethology/reid/backbones/resnet.py new file mode 100644 index 00000000..12ca5639 --- /dev/null +++ b/ethology/reid/backbones/resnet.py @@ -0,0 +1,273 @@ +# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +""" +Code source: https://github.com/pytorch/vision +""" +from __future__ import absolute_import, division + +import torch.utils.model_zoo as model_zoo +from torch import nn + +__all__ = [ + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext101_32x8d", + "resnet50_fc512", +] + +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", +} + +# ...existing code for conv3x3, conv1x1, BasicBlock, Bottleneck, ResNet, init_pretrained_weights, and instantiation functions... + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + +class BasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +class Bottleneck(nn.Module): + expansion = 4 + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +class ResNet(nn.Module): + def __init__(self, num_classes, loss, block, layers, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None, last_stride=2, fc_dims=None, dropout_p=None, **kwargs): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + self.loss = loss + self.feature_dim = 512 * block.expansion + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=last_stride, dilate=replace_stride_with_dilation[2]) + self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = self._construct_fc_layer(fc_dims, 512 * block.expansion, dropout_p) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None: + self.feature_dim = input_dim + return None + assert isinstance(fc_dims, (list, tuple)), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + +def init_pretrained_weights(model, model_url): + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size()} + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) + +def resnet18(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[2, 2, 2, 2], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet18"]) + return model + +def resnet34(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet34"]) + return model + +def resnet50(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model + +def resnet101(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet101"]) + return model + +def resnet152(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 8, 36, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet152"]) + return model + +def resnext50_32x4d(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=4, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnext50_32x4d"]) + return model + +def resnext101_32x8d(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=8, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnext101_32x8d"]) + return model + +def resnet50_fc512(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=1, fc_dims=[512], dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model diff --git a/ethology/reid/backends/__init__.py b/ethology/reid/backends/__init__.py new file mode 100644 index 00000000..f032d396 --- /dev/null +++ b/ethology/reid/backends/__init__.py @@ -0,0 +1 @@ +# Backends for ReID inference diff --git a/ethology/reid/backends/base_backend.py b/ethology/reid/backends/base_backend.py new file mode 100644 index 00000000..688ec43a --- /dev/null +++ b/ethology/reid/backends/base_backend.py @@ -0,0 +1,170 @@ + +import os +from abc import abstractmethod +from pathlib import Path +import cv2 +import gdown +import numpy as np +import torch +from filelock import SoftFileLock +from ethology.reid.core.registry import ReIDModelRegistry +# from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER +# from ethology.utils.checks import RequirementsChecker # If needed, implement or set RequirementsChecker + +class BaseModelBackend: + + def __init__(self, weights, device, half): + self.weights = weights[0] if isinstance(weights, list) else weights + if isinstance(self.weights, str): + self.weights = Path(self.weights) + # LOGGER.info(self.weights) + self.device = device + self.half = half + self.model = None + # Support both string and torch.device for device + if hasattr(self.device, 'type'): + self.cuda = torch.cuda.is_available() and self.device.type != "cpu" + else: + self.cuda = torch.cuda.is_available() and self.device != "cpu" + + self.download_model(self.weights) + self.model_name = ReIDModelRegistry.get_model_name(self.weights) + + self.model = ReIDModelRegistry.build_model( + self.model_name, + self.weights, + num_classes=ReIDModelRegistry.get_nr_classes(self.weights), + pretrained=not (self.weights and self.weights.is_file()), + use_gpu=device, + ) + # self.checker = RequirementsChecker() + + self.load_model(self.weights) + + self.mean_array = torch.tensor([0.485, 0.456, 0.406], device=self.device).view(1, 3, 1, 1) + self.std_array = torch.tensor([0.229, 0.224, 0.225], device=self.device).view(1, 3, 1, 1) + if "clip" in self.model_name: + self.mean_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) + self.std_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) + + if "vehicleid" in self.weights.name or "veri" in self.weights.name: + input_shape = (256, 256) + elif "lmbn" in self.model_name: + input_shape = (384, 128) + elif "hacnn" in self.model_name: + input_shape = (160, 64) + else: + input_shape = (256, 128) + self.input_shape = input_shape + + + def get_crops(self, xyxys, img): + h, w = img.shape[:2] + interpolation_method = cv2.INTER_LINEAR + num_crops = len(xyxys) + crops = torch.empty( + (num_crops, 3, *self.input_shape), + dtype=torch.half if self.half else torch.float, + device=self.device, + ) + for i, box in enumerate(xyxys): + x1, y1, x2, y2 = box.round().astype("int") + x1, y1, x2, y2 = max(0, x1), max(0, y1), min(w, x2), min(h, y2) + crop = img[y1:y2, x1:x2] + crop = cv2.resize( + crop, + (self.input_shape[1], self.input_shape[0]), + interpolation=interpolation_method, + ) + crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB) + crop = torch.from_numpy(crop).to( + self.device, dtype=torch.half if self.half else torch.float + ) + crops[i] = torch.permute(crop, (2, 0, 1)) + crops = crops / 255.0 + crops = (crops - self.mean_array) / self.std_array + return crops + + + @torch.no_grad() + def get_features(self, xyxys, img): + if xyxys.size != 0: + crops = self.get_crops(xyxys, img) + crops = self.inference_preprocess(crops) + features = self.forward(crops) + features = self.inference_postprocess(features) + else: + features = np.array([]) + features = features / np.linalg.norm(features, axis=-1, keepdims=True) + return features + + + def warmup(self, imgsz=[(256, 128, 3)]): + if self.device.type != "cpu": + im = np.random.randint(0, 255, *imgsz, dtype=np.uint8) + crops = self.get_crops( + xyxys=np.array([[0, 0, 64, 64], [0, 0, 128, 128]]), img=im + ) + crops = self.inference_preprocess(crops) + self.forward(crops) + + + def to_numpy(self, x): + return x.cpu().numpy() if isinstance(x, torch.Tensor) else x + + + def inference_preprocess(self, x): + if self.half: + if isinstance(x, torch.Tensor): + if x.dtype != torch.float16: + x = x.half() + elif isinstance(x, np.ndarray): + if x.dtype != np.float16: + x = x.astype(np.float16) + if hasattr(self, 'nhwc') and self.nhwc: + if isinstance(x, torch.Tensor): + x = x.permute(0, 2, 3, 1) + elif isinstance(x, np.ndarray): + x = np.transpose(x, (0, 2, 3, 1)) + return x + + + def inference_postprocess(self, features): + if isinstance(features, (list, tuple)): + return ( + self.to_numpy(features[0]) if len(features) == 1 else [self.to_numpy(x) for x in features] + ) + else: + return self.to_numpy(features) + + + @abstractmethod + def forward(self, im_batch): + raise NotImplementedError("This method should be implemented by subclasses.") + + + @abstractmethod + def load_model(self, w): + raise NotImplementedError("This method should be implemented by subclasses.") + + + def download_model(self, w): + if isinstance(w, str): + w = Path(w) + if w.suffix != ".pt": + return + model_url = ReIDModelRegistry.get_model_url(w) + lock = SoftFileLock(str(w) + ".lock", timeout=300) + with lock: + if w.exists() or "openvino" in w.name: + # LOGGER.info(f"[PID {os.getpid()}] Found existing ReID weights at {w}; skipping download.") + return + if model_url: + # LOGGER.info(f"[PID {os.getpid()}] Downloading ReID weights from {model_url} → {w}") + gdown.download(model_url, str(w), quiet=False) + else: + # LOGGER.error( + # f"No URL associated with the chosen ReID weights ({w}).\n" + # f"Choose one of the following:" + # ) + ReIDModelRegistry.show_downloadable_models() diff --git a/ethology/reid/backends/onnx_backend.py b/ethology/reid/backends/onnx_backend.py new file mode 100644 index 00000000..c7c93017 --- /dev/null +++ b/ethology/reid/backends/onnx_backend.py @@ -0,0 +1,31 @@ + +from ethology.reid.backends.base_backend import BaseModelBackend + +class ONNXBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # ONNXRuntime will attempt to use the first provider, and if it fails or is not + # available for some reason, it will fall back to the next provider in the list + if self.device.type == "mps": + # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) + providers = ["MPSExecutionProvider", "CPUExecutionProvider"] + elif self.device.type == "cuda": + # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + else: + # self.checker.check_packages(("onnxruntime==1.18.1",)) + providers = ["CPUExecutionProvider"] + import onnxruntime + self.session = onnxruntime.InferenceSession(str(w), providers=providers) + + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() + features = self.session.run( + [self.session.get_outputs()[0].name], + {self.session.get_inputs()[0].name: im_batch}, + )[0] + return features diff --git a/ethology/reid/backends/openvino_backend.py b/ethology/reid/backends/openvino_backend.py new file mode 100644 index 00000000..f06392bf --- /dev/null +++ b/ethology/reid/backends/openvino_backend.py @@ -0,0 +1,48 @@ +from pathlib import Path + +from ethology.reid.backends.base_backend import BaseModelBackend +# Note: LOGGER can be replaced with print or a local logger if needed + +class OpenVinoBackend(BaseModelBackend): + + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # self.checker.check_packages(("openvino>=2025.2.0",)) + + print(f"Loading {w} for OpenVINO inference...") + try: + # requires openvino-dev: https://pypi.org/project/openvino-dev/ + from openvino import Core, Layout + except ImportError: + print( + f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" + "requires openvino pip package to be installed!\n" + "$ pip install openvino>=2025.2.0\n" + ) + raise + ie = Core() + w = Path(w) + print(w) + if w.suffix == '.bin': + w = w.with_suffix('.xml') + + if not w.is_file(): # if not *.xml + w = next( + Path(w).glob("*.xml") + ) # get *.xml file from *_openvino_model dir + network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) + if network.get_parameters()[0].get_layout().empty: + network.get_parameters()[0].set_layout(Layout("NCWH")) + self.executable_network = ie.compile_model( + network, device_name="CPU" + ) # device_name="MYRIAD" for Intel NCS2 + self.output_layer = next(iter(self.executable_network.outputs)) + + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() # FP32 + features = self.executable_network([im_batch])[self.output_layer] + return features diff --git a/ethology/reid/backends/pytorch_backend.py b/ethology/reid/backends/pytorch_backend.py new file mode 100644 index 00000000..d3dbfa06 --- /dev/null +++ b/ethology/reid/backends/pytorch_backend.py @@ -0,0 +1,20 @@ +from ethology.reid.backends.base_backend import BaseModelBackend +from ethology.reid.core.registry import ReIDModelRegistry + +class PyTorchBackend(BaseModelBackend): + + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # Load a PyTorch model + if w and w.is_file(): + ReIDModelRegistry.load_pretrained_weights(self.model, w) + self.model.to(self.device).eval() + self.model.half() if self.half else self.model.float() + + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/backends/tensorrt_backend.py b/ethology/reid/backends/tensorrt_backend.py new file mode 100644 index 00000000..8dd7d7ee --- /dev/null +++ b/ethology/reid/backends/tensorrt_backend.py @@ -0,0 +1,310 @@ +from collections import OrderedDict, namedtuple + +import numpy as np +import torch + +from ethology.reid.backends.base_backend import BaseModelBackend +# Note: LOGGER can be replaced with print or a local logger if needed + +import os +import sys +import torch +import numpy as np +from collections import namedtuple, OrderedDict + + + +Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) + + +class TensorRTBackend(BaseModelBackend): + def __init__(self, engine_path, device=None): + import hashlib + import requests + self.device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) + self.fp16 = False + self.model_ = None + self.context = None + self.bindings = None + self.binding_addrs = None + self.is_trt10 = False + # Download engine if engine_path is a URL + if engine_path.startswith("http://") or engine_path.startswith("https://"): + # Use a hash of the URL for filename + engine_hash = hashlib.md5(engine_path.encode()).hexdigest() + filename = f"trt_engine_{engine_hash}.engine" + cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") + os.makedirs(cache_dir, exist_ok=True) + cached_file = os.path.join(cache_dir, filename) + if not os.path.exists(cached_file): + print(f"[TensorRT] Downloading engine from {engine_path} to {cached_file}") + with requests.get(engine_path, stream=True) as r: + r.raise_for_status() + with open(cached_file, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + else: + print(f"[TensorRT] Using cached engine at {cached_file}") + self.engine_path = cached_file + else: + self.engine_path = engine_path + self.load_model(self.engine_path) + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + try: + import tensorrt as trt + import pycuda.driver as cuda + import pycuda.autoinit # noqa: F401 + except ImportError: + raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError("CUDA device not available for TensorRT inference.") + + Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f: + with trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + # self.checker.check_packages(("nvidia-tensorrt",)) + try: + import tensorrt as trt # TensorRT library + except ImportError: + raise ImportError("Please install tensorrt to use this backend.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError("CUDA device not available for TensorRT inference.") + + Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f: + with trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features diff --git a/ethology/reid/backends/tflite_backend.py b/ethology/reid/backends/tflite_backend.py new file mode 100644 index 00000000..b0a7b707 --- /dev/null +++ b/ethology/reid/backends/tflite_backend.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import numpy as np +import torch + +from ethology.reid.backends.base_backend import BaseModelBackend +# Note: LOGGER can be replaced with print or a local logger if needed + +class TFLiteBackend(BaseModelBackend): + """ + A class to handle TensorFlow Lite model inference with dynamic batch size support. + """ + def __init__(self, weights: Path, device: str, half: bool): + super().__init__(weights, device, half) + self.nhwc = True + self.half = False + + def load_model(self, w): + # self.checker.check_packages(("tensorflow",)) + print(f"Loading {str(w)} for TensorFlow Lite inference...") + import tensorflow as tf + self.interpreter = tf.lite.Interpreter(model_path=str(w)) + self.interpreter.allocate_tensors() + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + self.current_allocated_batch_size = self.input_details[0]["shape"][0] + + def forward(self, im_batch: torch.Tensor) -> np.ndarray: + im_batch = im_batch.cpu().numpy() + batch_size = im_batch.shape[0] + if batch_size != self.current_allocated_batch_size: + self.interpreter.resize_tensor_input( + self.input_details[0]["index"], [batch_size, 256, 128, 3] + ) + self.interpreter.allocate_tensors() + self.current_allocated_batch_size = batch_size + self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) + self.interpreter.invoke() + features = self.interpreter.get_tensor(self.output_details[0]["index"]) + return features diff --git a/ethology/reid/backends/torchscript_backend.py b/ethology/reid/backends/torchscript_backend.py new file mode 100644 index 00000000..b6602171 --- /dev/null +++ b/ethology/reid/backends/torchscript_backend.py @@ -0,0 +1,20 @@ +import torch + +from ethology.reid.backends.base_backend import BaseModelBackend +# Note: LOGGER can be replaced with print or a local logger if needed + +class TorchscriptBackend(BaseModelBackend): + + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + print(f"Loading {w} for TorchScript inference...") + self.model = torch.jit.load(w) + self.model.half() if self.half else self.model.float() + + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/core/__init__.py b/ethology/reid/core/__init__.py new file mode 100644 index 00000000..9dab06ec --- /dev/null +++ b/ethology/reid/core/__init__.py @@ -0,0 +1 @@ +# Core logic for ReID diff --git a/ethology/reid/core/auto_backend.py b/ethology/reid/core/auto_backend.py new file mode 100644 index 00000000..6f43eba2 --- /dev/null +++ b/ethology/reid/core/auto_backend.py @@ -0,0 +1,74 @@ + +from pathlib import Path +from typing import Tuple, Union +import torch +from ethology.reid.backends.onnx_backend import ONNXBackend +from ethology.reid.backends.openvino_backend import OpenVinoBackend +from ethology.reid.backends.pytorch_backend import PyTorchBackend +try: + from ethology.reid.backends.tensorrt_backend import TensorRTBackend +except ImportError: + class TensorRTBackend: + def __init__(self, *args, **kwargs): + raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH.") +from ethology.reid.backends.tflite_backend import TFLiteBackend +from ethology.reid.backends.torchscript_backend import TorchscriptBackend +# from ethology.reid.core import export_formats # If needed, implement or copy export_formats +# from ethology.utils import WEIGHTS # If needed, implement or set WEIGHTS +# from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER +# from ethology.utils.torch_utils import select_device # If needed, implement or set select_device + +class ReidAutoBackend: + def __init__( + self, + weights: Path, + device: torch.device = torch.device("cpu"), + half: bool = False, + ): + super().__init__() + w = weights[0] if isinstance(weights, list) else weights + ( + self.pt, + self.pth, + self.jit, + self.onnx, + self.xml, + self.engine, + self.tflite, + ) = self.model_type(w) + self.weights = weights + self.device = device # For simplicity, skip select_device for now + self.half = half + self.model = self.get_backend() + + def get_backend(self): + backend_map = { + self.pt or self.pth: PyTorchBackend, + self.jit: TorchscriptBackend, + self.onnx: ONNXBackend, + self.engine: TensorRTBackend, + self.xml: OpenVinoBackend, + self.tflite: TFLiteBackend, + } + for condition, backend_class in backend_map.items(): + if condition: + return backend_class(self.weights, self.device, self.half) + raise RuntimeError("This model framework is not supported yet!") + + def check_suffix(self, file: Path = "osnet_x0_25_msmt17.pt", suffix: Union[str, Tuple[str, ...]] = (".pt",), msg: str = ""): + suffix = [suffix] if isinstance(suffix, str) else list(suffix) + files = [file] if isinstance(file, (str, Path)) else list(file) + for f in files: + file_suffix = Path(f).suffix.lower() + if file_suffix and file_suffix not in suffix: + print(f"File {f} does not have an acceptable suffix. Expected: {suffix}") + + def model_type(self, p: Path) -> Tuple[bool, ...]: + # For demo, just check for .pt + sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] + self.check_suffix(p, sf) + types = [str(Path(p)).endswith(s) for s in sf] + # OpenVINO explicit check + if Path(p).suffix in ['.xml', '.bin']: + types[3] = True + return tuple(types) diff --git a/ethology/reid/core/config.py b/ethology/reid/core/config.py new file mode 100644 index 00000000..926c0cc9 --- /dev/null +++ b/ethology/reid/core/config.py @@ -0,0 +1,16 @@ +MODEL_TYPES = [ + "resnet50", + "resnet101", + "mlfn", + "hacnn", + "mobilenetv2_x1_0", + "mobilenetv2_x1_4", + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", + "osnet_ain_x1_0", + "lmbn_n", + "clip", +] diff --git a/ethology/reid/core/factory.py b/ethology/reid/core/factory.py new file mode 100644 index 00000000..bc8b6ab1 --- /dev/null +++ b/ethology/reid/core/factory.py @@ -0,0 +1,30 @@ + +# Import model constructors from ethology's local backbones +from ethology.reid.backbones.hacnn import HACNN +from ethology.reid.backbones.mlfn import mlfn +from ethology.reid.backbones.mobilenetv2 import mobilenetv2_x1_0, mobilenetv2_x1_4 +from ethology.reid.backbones.osnet import osnet_ibn_x1_0, osnet_x0_5, osnet_x0_25, osnet_x0_75, osnet_x1_0 +from ethology.reid.backbones.osnet_ain import osnet_ain_x0_5, osnet_ain_x0_25, osnet_ain_x0_75, osnet_ain_x1_0 +from ethology.reid.backbones.resnet import resnet50, resnet101 +# from ethology.reid.backbones.lmbn.lmbn_n import LMBN_n # If present +# from ethology.reid.backbones.clip.make_model import make_model # If present + +MODEL_FACTORY = { + "resnet50": resnet50, + "resnet101": resnet101, + "mobilenetv2_x1_0": mobilenetv2_x1_0, + "mobilenetv2_x1_4": mobilenetv2_x1_4, + "hacnn": HACNN, + "mlfn": mlfn, + "osnet_x1_0": osnet_x1_0, + "osnet_x0_75": osnet_x0_75, + "osnet_x0_5": osnet_x0_5, + "osnet_x0_25": osnet_x0_25, + "osnet_ibn_x1_0": osnet_ibn_x1_0, + "osnet_ain_x1_0": osnet_ain_x1_0, + "osnet_ain_x0_75": osnet_ain_x0_75, + "osnet_ain_x0_5": osnet_ain_x0_5, + "osnet_ain_x0_25": osnet_ain_x0_25, + # "lmbn_n": LMBN_n, # Uncomment if implemented + # "clip": make_model, # Uncomment if implemented +} diff --git a/ethology/reid/core/handler.py b/ethology/reid/core/handler.py new file mode 100644 index 00000000..b5e51391 --- /dev/null +++ b/ethology/reid/core/handler.py @@ -0,0 +1,33 @@ +# Main handler for ReID in ethology + +# Thin wrapper to use BoxMOT ReID models in ethology +from pathlib import Path +from typing import Union +import numpy as np + + +# Import ethology's local ReID handler +from ethology.reid.core.reid_handler import ReID as EthologyReID + +class ReIDHandler: + """ + Ethology ReID handler using local models and backends. + """ + def __init__(self, weights: Union[str, Path], device='cpu', half=False): + self.model = EthologyReID(weights=weights, device=device, half=half) + + def extract_features(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """ + Extract feature embeddings for detections in a frame. + Parameters + ---------- + frame : np.ndarray + (H, W, C) BGR image. + dets : np.ndarray + (N, 6) array of detections (x1, y1, x2, y2, conf, cls). + Returns + ------- + np.ndarray + (N, D) feature embeddings. + """ + return self.model(frame, dets) diff --git a/ethology/reid/core/registry.py b/ethology/reid/core/registry.py new file mode 100644 index 00000000..333cff2f --- /dev/null +++ b/ethology/reid/core/registry.py @@ -0,0 +1,71 @@ + +from collections import OrderedDict +import torch +from ethology.reid.core.config import MODEL_TYPES #, NR_CLASSES_DICT, TRAINED_URLS +from ethology.reid.core.factory import MODEL_FACTORY +# from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER + +class ReIDModelRegistry: + """Encapsulates model registration and related utilities.""" + + @staticmethod + def show_downloadable_models(): + # LOGGER.info("Available .pt ReID models for automatic download") + # LOGGER.info(list(TRAINED_URLS.keys())) + pass + + @staticmethod + def get_model_name(model): + for name in MODEL_TYPES: + if name in model.name: + return name + return None + + @staticmethod + def get_model_url(model): + # return TRAINED_URLS.get(model.name, None) + return None + + @staticmethod + def load_pretrained_weights(model, weight_path): + device = "cpu" if not torch.cuda.is_available() else None + checkpoint = torch.load( + weight_path, + map_location=torch.device("cpu") if device == "cpu" else None, + weights_only=False, + encoding='latin1', + ) + state_dict = checkpoint.get("state_dict", checkpoint) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + key = k[7:] if k.startswith("module.") else k + if key in model_dict and model_dict[key].size() == v.size(): + new_state_dict[key] = v + matched_layers.append(key) + else: + discarded_layers.append(key) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + + @staticmethod + def show_available_models(): + # LOGGER.info("Available models:") + # LOGGER.info(list(MODEL_FACTORY.keys())) + pass + + @staticmethod + def get_nr_classes(weights): + # dataset_key = weights.name.split("_")[1] + # return NR_CLASSES_DICT.get(dataset_key, 1) + return 1 + + @staticmethod + def build_model(name, weights, num_classes, loss="softmax", pretrained=True, use_gpu=True): + if name not in MODEL_FACTORY: + available = list(MODEL_FACTORY.keys()) + raise KeyError(f"Unknown model '{name}'. Must be one of {available}") + return MODEL_FACTORY[name]( + num_classes=num_classes, loss=loss, pretrained=pretrained, use_gpu=use_gpu + ) diff --git a/ethology/reid/core/reid_handler.py b/ethology/reid/core/reid_handler.py new file mode 100644 index 00000000..2c72658a --- /dev/null +++ b/ethology/reid/core/reid_handler.py @@ -0,0 +1,28 @@ + +from pathlib import Path +from typing import Union +import numpy as np +from ethology.reid.core.auto_backend import ReidAutoBackend + +class ReID: + def __init__(self, weights: Union[str, Path], device='cpu', half=False): + self.weights = Path(weights) + self.device = device + self.half = half + self.backend = ReidAutoBackend(weights=self.weights, device=device, half=half) + self.model = self.backend.model + + def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """ + Extract features for detections in a frame. + Args: + frame: (H, W, C) BGR image + dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. + Returns: + embs: (N, D) embeddings. + """ + if dets.shape[0] == 0: + return np.empty((0, 0)) + xyxy = dets[:, :4] + embs = self.model.get_features(xyxy, frame) + return embs diff --git a/tests/test_unit/test_reid_handler.py b/tests/test_unit/test_reid_handler.py new file mode 100644 index 00000000..3a5146cf --- /dev/null +++ b/tests/test_unit/test_reid_handler.py @@ -0,0 +1,12 @@ +import numpy as np +from ethology.reid.core.handler import ReIDHandler + +def test_extract_features_shape(): + handler = ReIDHandler(weights='osnet_x0_25_imagenet.pth') + frame = np.random.randint(0, 255, (128, 64, 3), dtype=np.uint8) + dets = np.array([ + [10, 10, 50, 100, 0.9, 1], + [60, 20, 100, 110, 0.8, 2], + ]) + feats = handler.extract_features(frame, dets) + assert feats.shape[0] == dets.shape[0] From dd0c1bd106d1696aafe88da17f5a439c5bf78021 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 13:41:00 +0000 Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ethology/reid/backbones/hacnn.py | 561 ++++++------ ethology/reid/backbones/mlfn.py | 463 +++++----- ethology/reid/backbones/mobilenetv2.py | 456 +++++----- ethology/reid/backbones/osnet.py | 800 ++++++++++------- ethology/reid/backbones/osnet_ain.py | 831 +++++++++++------- ethology/reid/backbones/resnet.py | 652 +++++++++----- ethology/reid/backends/base_backend.py | 48 +- ethology/reid/backends/onnx_backend.py | 55 +- ethology/reid/backends/openvino_backend.py | 83 +- ethology/reid/backends/pytorch_backend.py | 28 +- ethology/reid/backends/tensorrt_backend.py | 688 ++++++++------- ethology/reid/backends/tflite_backend.py | 64 +- ethology/reid/backends/torchscript_backend.py | 25 +- ethology/reid/core/auto_backend.py | 127 +-- ethology/reid/core/config.py | 28 +- ethology/reid/core/factory.py | 56 +- ethology/reid/core/handler.py | 21 +- ethology/reid/core/registry.py | 133 +-- ethology/reid/core/reid_handler.py | 51 +- tests/test_unit/test_reid_handler.py | 14 +- 20 files changed, 3005 insertions(+), 2179 deletions(-) diff --git a/ethology/reid/backbones/hacnn.py b/ethology/reid/backbones/hacnn.py index e48ea2b7..9394ad30 100644 --- a/ethology/reid/backbones/hacnn.py +++ b/ethology/reid/backbones/hacnn.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import torch from torch import nn @@ -10,283 +9,315 @@ class ConvBlock(nn.Module): - def __init__(self, in_c, out_c, k, s=1, p=0): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) - self.bn = nn.BatchNorm2d(out_c) - def forward(self, x): - return F.relu(self.bn(self.conv(x))) + def __init__(self, in_c, out_c, k, s=1, p=0): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) + self.bn = nn.BatchNorm2d(out_c) + + def forward(self, x): + return F.relu(self.bn(self.conv(x))) + class InceptionA(nn.Module): - def __init__(self, in_channels, out_channels): - super(InceptionA, self).__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream3 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream4 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1), - ConvBlock(in_channels, mid_channels, 1), - ) - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - s4 = self.stream4(x) - y = torch.cat([s1, s2, s3, s4], dim=1) - return y + def __init__(self, in_channels, out_channels): + super(InceptionA, self).__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream3 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream4 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1), + ConvBlock(in_channels, mid_channels, 1), + ) + + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + s4 = self.stream4(x) + y = torch.cat([s1, s2, s3, s4], dim=1) + return y + class InceptionB(nn.Module): - def __init__(self, in_channels, out_channels): - super(InceptionB, self).__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream3 = nn.Sequential( - nn.MaxPool2d(3, stride=2, padding=1), - ConvBlock(in_channels, mid_channels * 2, 1), - ) - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - y = torch.cat([s1, s2, s3], dim=1) - return y + def __init__(self, in_channels, out_channels): + super(InceptionB, self).__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream3 = nn.Sequential( + nn.MaxPool2d(3, stride=2, padding=1), + ConvBlock(in_channels, mid_channels * 2, 1), + ) + + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + y = torch.cat([s1, s2, s3], dim=1) + return y + class SpatialAttn(nn.Module): - def __init__(self): - super(SpatialAttn, self).__init__() - self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) - self.conv2 = ConvBlock(1, 1, 1) - def forward(self, x): - x = x.mean(1, keepdim=True) - x = self.conv1(x) - x = F.interpolate( - x, (x.size(2) * 2, x.size(3) * 2), mode="bilinear", align_corners=True - ) - x = self.conv2(x) - return x + def __init__(self): + super(SpatialAttn, self).__init__() + self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) + self.conv2 = ConvBlock(1, 1, 1) + + def forward(self, x): + x = x.mean(1, keepdim=True) + x = self.conv1(x) + x = F.interpolate( + x, + (x.size(2) * 2, x.size(3) * 2), + mode="bilinear", + align_corners=True, + ) + x = self.conv2(x) + return x + class ChannelAttn(nn.Module): - def __init__(self, in_channels, reduction_rate=16): - super(ChannelAttn, self).__init__() - assert in_channels % reduction_rate == 0 - self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) - self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]) - x = self.conv1(x) - x = self.conv2(x) - return x + def __init__(self, in_channels, reduction_rate=16): + super(ChannelAttn, self).__init__() + assert in_channels % reduction_rate == 0 + self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) + self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) + + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]) + x = self.conv1(x) + x = self.conv2(x) + return x + class SoftAttn(nn.Module): - def __init__(self, in_channels): - super(SoftAttn, self).__init__() - self.spatial_attn = SpatialAttn() - self.channel_attn = ChannelAttn(in_channels) - self.conv = ConvBlock(in_channels, in_channels, 1) - def forward(self, x): - y_spatial = self.spatial_attn(x) - y_channel = self.channel_attn(x) - y = y_spatial * y_channel - y = torch.sigmoid(self.conv(y)) - return y + def __init__(self, in_channels): + super(SoftAttn, self).__init__() + self.spatial_attn = SpatialAttn() + self.channel_attn = ChannelAttn(in_channels) + self.conv = ConvBlock(in_channels, in_channels, 1) + + def forward(self, x): + y_spatial = self.spatial_attn(x) + y_channel = self.channel_attn(x) + y = y_spatial * y_channel + y = torch.sigmoid(self.conv(y)) + return y + class HardAttn(nn.Module): - def __init__(self, in_channels): - super(HardAttn, self).__init__() - self.fc = nn.Linear(in_channels, 4 * 2) - self.init_params() - def init_params(self): - self.fc.weight.data.zero_() - self.fc.bias.data.copy_( - torch.tensor([0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float) - ) - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) - theta = torch.tanh(self.fc(x)) - theta = theta.view(-1, 4, 2) - return theta + def __init__(self, in_channels): + super(HardAttn, self).__init__() + self.fc = nn.Linear(in_channels, 4 * 2) + self.init_params() + + def init_params(self): + self.fc.weight.data.zero_() + self.fc.bias.data.copy_( + torch.tensor( + [0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float + ) + ) + + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) + theta = torch.tanh(self.fc(x)) + theta = theta.view(-1, 4, 2) + return theta + class HarmAttn(nn.Module): - def __init__(self, in_channels): - super(HarmAttn, self).__init__() - self.soft_attn = SoftAttn(in_channels) - self.hard_attn = HardAttn(in_channels) - def forward(self, x): - y_soft_attn = self.soft_attn(x) - theta = self.hard_attn(x) - return y_soft_attn, theta + def __init__(self, in_channels): + super(HarmAttn, self).__init__() + self.soft_attn = SoftAttn(in_channels) + self.hard_attn = HardAttn(in_channels) + + def forward(self, x): + y_soft_attn = self.soft_attn(x) + theta = self.hard_attn(x) + return y_soft_attn, theta + class HACNN(nn.Module): - def __init__( - self, - num_classes, - loss="softmax", - nchannels=[128, 256, 384], - feat_dim=512, - learn_region=True, - use_gpu=True, - **kwargs, - ): - super(HACNN, self).__init__() - self.loss = loss - self.learn_region = learn_region - self.use_gpu = use_gpu - self.conv = ConvBlock(3, 32, 3, s=2, p=1) - self.inception1 = nn.Sequential( - InceptionA(32, nchannels[0]), - InceptionB(nchannels[0], nchannels[0]), - ) - self.ha1 = HarmAttn(nchannels[0]) - self.inception2 = nn.Sequential( - InceptionA(nchannels[0], nchannels[1]), - InceptionB(nchannels[1], nchannels[1]), - ) - self.ha2 = HarmAttn(nchannels[1]) - self.inception3 = nn.Sequential( - InceptionA(nchannels[1], nchannels[2]), - InceptionB(nchannels[2], nchannels[2]), - ) - self.ha3 = HarmAttn(nchannels[2]) - self.fc_global = nn.Sequential( - nn.Linear(nchannels[2], feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_global = nn.Linear(feat_dim, num_classes) - if self.learn_region: - self.init_scale_factors() - self.local_conv1 = InceptionB(32, nchannels[0]) - self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) - self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) - self.fc_local = nn.Sequential( - nn.Linear(nchannels[2] * 4, feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_local = nn.Linear(feat_dim, num_classes) - self.feat_dim = feat_dim * 2 - else: - self.feat_dim = feat_dim - def init_scale_factors(self): - self.scale_factors = [] - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - def stn(self, x, theta): - grid = F.affine_grid(theta, x.size()) - x = F.grid_sample(x, grid) - return x - def transform_theta(self, theta_i, region_idx): - scale_factors = self.scale_factors[region_idx] - theta = torch.zeros(theta_i.size(0), 2, 3) - theta[:, :, :2] = scale_factors - theta[:, :, -1] = theta_i - if self.use_gpu: - theta = theta.to(next(self.parameters()).device) - return theta - def forward(self, x): - assert ( - x.size(2) == 160 and x.size(3) == 64 - ), "Input size does not match, expected (160, 64) but got ({}, {})".format( - x.size(2), x.size(3) - ) - x = self.conv(x) - x1 = self.inception1(x) - x1_attn, x1_theta = self.ha1(x1) - x1_out = x1 * x1_attn - if self.learn_region: - x1_local_list = [] - for region_idx in range(4): - x1_theta_i = x1_theta[:, region_idx, :] - x1_theta_i = self.transform_theta(x1_theta_i, region_idx) - x1_trans_i = self.stn(x, x1_theta_i) - x1_trans_i = F.interpolate( - x1_trans_i, (24, 28), mode="bilinear", align_corners=True - ) - x1_local_i = self.local_conv1(x1_trans_i) - x1_local_list.append(x1_local_i) - x2 = self.inception2(x1_out) - x2_attn, x2_theta = self.ha2(x2) - x2_out = x2 * x2_attn - if self.learn_region: - x2_local_list = [] - for region_idx in range(4): - x2_theta_i = x2_theta[:, region_idx, :] - x2_theta_i = self.transform_theta(x2_theta_i, region_idx) - x2_trans_i = self.stn(x1_out, x2_theta_i) - x2_trans_i = F.interpolate( - x2_trans_i, (12, 14), mode="bilinear", align_corners=True - ) - x2_local_i = x2_trans_i + x1_local_list[region_idx] - x2_local_i = self.local_conv2(x2_local_i) - x2_local_list.append(x2_local_i) - x3 = self.inception3(x2_out) - x3_attn, x3_theta = self.ha3(x3) - x3_out = x3 * x3_attn - if self.learn_region: - x3_local_list = [] - for region_idx in range(4): - x3_theta_i = x3_theta[:, region_idx, :] - x3_theta_i = self.transform_theta(x3_theta_i, region_idx) - x3_trans_i = self.stn(x2_out, x3_theta_i) - x3_trans_i = F.interpolate( - x3_trans_i, (6, 7), mode="bilinear", align_corners=True - ) - x3_local_i = x3_trans_i + x2_local_list[region_idx] - x3_local_i = self.local_conv3(x3_local_i) - x3_local_list.append(x3_local_i) - x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( - x3_out.size(0), x3_out.size(1) - ) - x_global = self.fc_global(x_global) - if self.learn_region: - x_local_list = [] - for region_idx in range(4): - x_local_i = x3_local_list[region_idx] - x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( - x_local_i.size(0), -1 - ) - x_local_list.append(x_local_i) - x_local = torch.cat(x_local_list, 1) - x_local = self.fc_local(x_local) - if not self.training: - if self.learn_region: - x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) - x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) - return torch.cat([x_global, x_local], 1) - else: - return x_global - prelogits_global = self.classifier_global(x_global) - if self.learn_region: - prelogits_local = self.classifier_local(x_local) - if self.loss == "softmax": - if self.learn_region: - return (prelogits_global, prelogits_local) - else: - return prelogits_global - elif self.loss == "triplet": - if self.learn_region: - return (prelogits_global, prelogits_local), (x_global, x_local) - else: - return prelogits_global, x_global - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + loss="softmax", + nchannels=[128, 256, 384], + feat_dim=512, + learn_region=True, + use_gpu=True, + **kwargs, + ): + super(HACNN, self).__init__() + self.loss = loss + self.learn_region = learn_region + self.use_gpu = use_gpu + self.conv = ConvBlock(3, 32, 3, s=2, p=1) + self.inception1 = nn.Sequential( + InceptionA(32, nchannels[0]), + InceptionB(nchannels[0], nchannels[0]), + ) + self.ha1 = HarmAttn(nchannels[0]) + self.inception2 = nn.Sequential( + InceptionA(nchannels[0], nchannels[1]), + InceptionB(nchannels[1], nchannels[1]), + ) + self.ha2 = HarmAttn(nchannels[1]) + self.inception3 = nn.Sequential( + InceptionA(nchannels[1], nchannels[2]), + InceptionB(nchannels[2], nchannels[2]), + ) + self.ha3 = HarmAttn(nchannels[2]) + self.fc_global = nn.Sequential( + nn.Linear(nchannels[2], feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_global = nn.Linear(feat_dim, num_classes) + if self.learn_region: + self.init_scale_factors() + self.local_conv1 = InceptionB(32, nchannels[0]) + self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) + self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) + self.fc_local = nn.Sequential( + nn.Linear(nchannels[2] * 4, feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_local = nn.Linear(feat_dim, num_classes) + self.feat_dim = feat_dim * 2 + else: + self.feat_dim = feat_dim + + def init_scale_factors(self): + self.scale_factors = [] + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + + def stn(self, x, theta): + grid = F.affine_grid(theta, x.size()) + x = F.grid_sample(x, grid) + return x + + def transform_theta(self, theta_i, region_idx): + scale_factors = self.scale_factors[region_idx] + theta = torch.zeros(theta_i.size(0), 2, 3) + theta[:, :, :2] = scale_factors + theta[:, :, -1] = theta_i + if self.use_gpu: + theta = theta.to(next(self.parameters()).device) + return theta + + def forward(self, x): + assert x.size(2) == 160 and x.size(3) == 64, ( + f"Input size does not match, expected (160, 64) but got ({x.size(2)}, {x.size(3)})" + ) + x = self.conv(x) + x1 = self.inception1(x) + x1_attn, x1_theta = self.ha1(x1) + x1_out = x1 * x1_attn + if self.learn_region: + x1_local_list = [] + for region_idx in range(4): + x1_theta_i = x1_theta[:, region_idx, :] + x1_theta_i = self.transform_theta(x1_theta_i, region_idx) + x1_trans_i = self.stn(x, x1_theta_i) + x1_trans_i = F.interpolate( + x1_trans_i, (24, 28), mode="bilinear", align_corners=True + ) + x1_local_i = self.local_conv1(x1_trans_i) + x1_local_list.append(x1_local_i) + x2 = self.inception2(x1_out) + x2_attn, x2_theta = self.ha2(x2) + x2_out = x2 * x2_attn + if self.learn_region: + x2_local_list = [] + for region_idx in range(4): + x2_theta_i = x2_theta[:, region_idx, :] + x2_theta_i = self.transform_theta(x2_theta_i, region_idx) + x2_trans_i = self.stn(x1_out, x2_theta_i) + x2_trans_i = F.interpolate( + x2_trans_i, (12, 14), mode="bilinear", align_corners=True + ) + x2_local_i = x2_trans_i + x1_local_list[region_idx] + x2_local_i = self.local_conv2(x2_local_i) + x2_local_list.append(x2_local_i) + x3 = self.inception3(x2_out) + x3_attn, x3_theta = self.ha3(x3) + x3_out = x3 * x3_attn + if self.learn_region: + x3_local_list = [] + for region_idx in range(4): + x3_theta_i = x3_theta[:, region_idx, :] + x3_theta_i = self.transform_theta(x3_theta_i, region_idx) + x3_trans_i = self.stn(x2_out, x3_theta_i) + x3_trans_i = F.interpolate( + x3_trans_i, (6, 7), mode="bilinear", align_corners=True + ) + x3_local_i = x3_trans_i + x2_local_list[region_idx] + x3_local_i = self.local_conv3(x3_local_i) + x3_local_list.append(x3_local_i) + x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( + x3_out.size(0), x3_out.size(1) + ) + x_global = self.fc_global(x_global) + if self.learn_region: + x_local_list = [] + for region_idx in range(4): + x_local_i = x3_local_list[region_idx] + x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( + x_local_i.size(0), -1 + ) + x_local_list.append(x_local_i) + x_local = torch.cat(x_local_list, 1) + x_local = self.fc_local(x_local) + if not self.training: + if self.learn_region: + x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) + x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) + return torch.cat([x_global, x_local], 1) + else: + return x_global + prelogits_global = self.classifier_global(x_global) + if self.learn_region: + prelogits_local = self.classifier_local(x_local) + if self.loss == "softmax": + if self.learn_region: + return (prelogits_global, prelogits_local) + else: + return prelogits_global + elif self.loss == "triplet": + if self.learn_region: + return (prelogits_global, prelogits_local), (x_global, x_local) + else: + return prelogits_global, x_global + else: + raise KeyError(f"Unsupported loss: {self.loss}") diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index bb1e235c..8daad863 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import torch import torch.utils.model_zoo as model_zoo from torch import nn @@ -8,232 +7,266 @@ __all__ = ["mlfn"] model_urls = { - # training epoch = 5, top1 = 51.6 - "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", + # training epoch = 5, top1 = 51.6 + "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", } class MLFNBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride, fsm_channels, groups=32): - super(MLFNBlock, self).__init__() - self.groups = groups - mid_channels = out_channels // 2 - - # Factor Modules - self.fm_conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False) - self.fm_bn1 = nn.BatchNorm2d(mid_channels) - self.fm_conv2 = nn.Conv2d( - mid_channels, - mid_channels, - 3, - stride=stride, - padding=1, - bias=False, - groups=self.groups, - ) - self.fm_bn2 = nn.BatchNorm2d(mid_channels) - self.fm_conv3 = nn.Conv2d(mid_channels, out_channels, 1, bias=False) - self.fm_bn3 = nn.BatchNorm2d(out_channels) - - # Factor Selection Module - self.fsm = nn.Sequential( - nn.AdaptiveAvgPool2d(1), - nn.Conv2d(in_channels, fsm_channels[0], 1), - nn.BatchNorm2d(fsm_channels[0]), - nn.ReLU(inplace=True), - nn.Conv2d(fsm_channels[0], fsm_channels[1], 1), - nn.BatchNorm2d(fsm_channels[1]), - nn.ReLU(inplace=True), - nn.Conv2d(fsm_channels[1], self.groups, 1), - nn.BatchNorm2d(self.groups), - nn.Sigmoid(), - ) - - self.downsample = None - if in_channels != out_channels or stride > 1: - self.downsample = nn.Sequential( - nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - residual = x - s = self.fsm(x) - - # reduce dimension - x = self.fm_conv1(x) - x = self.fm_bn1(x) - x = F.relu(x, inplace=True) - - # group convolution - x = self.fm_conv2(x) - x = self.fm_bn2(x) - x = F.relu(x, inplace=True) - - # factor selection - b, c = x.size(0), x.size(1) - n = c // self.groups - ss = s.repeat(1, n, 1, 1) # from (b, g, 1, 1) to (b, g*n=c, 1, 1) - ss = ss.view(b, n, self.groups, 1, 1) - ss = ss.permute(0, 2, 1, 3, 4).contiguous() - ss = ss.view(b, c, 1, 1) - x = ss * x - - # recover dimension - x = self.fm_conv3(x) - x = self.fm_bn3(x) - x = F.relu(x, inplace=True) - - if self.downsample is not None: - residual = self.downsample(residual) - - return F.relu(residual + x, inplace=True), s + def __init__( + self, in_channels, out_channels, stride, fsm_channels, groups=32 + ): + super(MLFNBlock, self).__init__() + self.groups = groups + mid_channels = out_channels // 2 + + # Factor Modules + self.fm_conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False) + self.fm_bn1 = nn.BatchNorm2d(mid_channels) + self.fm_conv2 = nn.Conv2d( + mid_channels, + mid_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=self.groups, + ) + self.fm_bn2 = nn.BatchNorm2d(mid_channels) + self.fm_conv3 = nn.Conv2d(mid_channels, out_channels, 1, bias=False) + self.fm_bn3 = nn.BatchNorm2d(out_channels) + + # Factor Selection Module + self.fsm = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, fsm_channels[0], 1), + nn.BatchNorm2d(fsm_channels[0]), + nn.ReLU(inplace=True), + nn.Conv2d(fsm_channels[0], fsm_channels[1], 1), + nn.BatchNorm2d(fsm_channels[1]), + nn.ReLU(inplace=True), + nn.Conv2d(fsm_channels[1], self.groups, 1), + nn.BatchNorm2d(self.groups), + nn.Sigmoid(), + ) + + self.downsample = None + if in_channels != out_channels or stride > 1: + self.downsample = nn.Sequential( + nn.Conv2d( + in_channels, out_channels, 1, stride=stride, bias=False + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + residual = x + s = self.fsm(x) + + # reduce dimension + x = self.fm_conv1(x) + x = self.fm_bn1(x) + x = F.relu(x, inplace=True) + + # group convolution + x = self.fm_conv2(x) + x = self.fm_bn2(x) + x = F.relu(x, inplace=True) + + # factor selection + b, c = x.size(0), x.size(1) + n = c // self.groups + ss = s.repeat(1, n, 1, 1) # from (b, g, 1, 1) to (b, g*n=c, 1, 1) + ss = ss.view(b, n, self.groups, 1, 1) + ss = ss.permute(0, 2, 1, 3, 4).contiguous() + ss = ss.view(b, c, 1, 1) + x = ss * x + + # recover dimension + x = self.fm_conv3(x) + x = self.fm_bn3(x) + x = F.relu(x, inplace=True) + + if self.downsample is not None: + residual = self.downsample(residual) + + return F.relu(residual + x, inplace=True), s class MLFN(nn.Module): - """Multi-Level Factorisation Net. - - Reference: - Chang et al. Multi-Level Factorisation Net for - Person Re-Identification. CVPR 2018. - - Public keys: - - ``mlfn``: MLFN (Multi-Level Factorisation Net). - """ - - def __init__( - self, - num_classes, - loss="softmax", - groups=32, - channels=[64, 256, 512, 1024, 2048], - embed_dim=1024, - **kwargs, - ): - super(MLFN, self).__init__() - self.loss = loss - self.groups = groups - - # first convolutional layer - self.conv1 = nn.Conv2d(3, channels[0], 7, stride=2, padding=3) - self.bn1 = nn.BatchNorm2d(channels[0]) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - - # main body - self.feature = nn.ModuleList( - [ - # layer 1-3 - MLFNBlock(channels[0], channels[1], 1, [128, 64], self.groups), - MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), - MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), - # layer 4-7 - MLFNBlock(channels[1], channels[2], 2, [256, 128], self.groups), - MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), - MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), - MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups), - # layer 8-13 - MLFNBlock(channels[2], channels[3], 2, [512, 128], self.groups), - MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), - MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), - MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), - MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), - MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups), - # layer 14-16 - MLFNBlock(channels[3], channels[4], 2, [512, 128], self.groups), - MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups), - MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups), - ] - ) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - - # projection functions - self.fc_x = nn.Sequential( - nn.Conv2d(channels[4], embed_dim, 1, bias=False), - nn.BatchNorm2d(embed_dim), - nn.ReLU(inplace=True), - ) - self.fc_s = nn.Sequential( - nn.Conv2d(self.groups * 16, embed_dim, 1, bias=False), - nn.BatchNorm2d(embed_dim), - nn.ReLU(inplace=True), - ) - - self.classifier = nn.Linear(embed_dim, num_classes) - - self.init_params() - - def init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = F.relu(x, inplace=True) - x = self.maxpool(x) - - s_hat = [] - for block in self.feature: - x, s = block(x) - s_hat.append(s) - s_hat = torch.cat(s_hat, 1) - - x = self.global_avgpool(x) - x = self.fc_x(x) - s_hat = self.fc_s(s_hat) - - v = (x + s_hat) * 0.5 - v = v.view(v.size(0), -1) - - if not self.training: - return v - - y = self.classifier(v) - - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + """Multi-Level Factorisation Net. + + Reference: + Chang et al. Multi-Level Factorisation Net for + Person Re-Identification. CVPR 2018. + + Public keys: + - ``mlfn``: MLFN (Multi-Level Factorisation Net). + """ + + def __init__( + self, + num_classes, + loss="softmax", + groups=32, + channels=[64, 256, 512, 1024, 2048], + embed_dim=1024, + **kwargs, + ): + super(MLFN, self).__init__() + self.loss = loss + self.groups = groups + + # first convolutional layer + self.conv1 = nn.Conv2d(3, channels[0], 7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(channels[0]) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + + # main body + self.feature = nn.ModuleList( + [ + # layer 1-3 + MLFNBlock(channels[0], channels[1], 1, [128, 64], self.groups), + MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), + MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups), + # layer 4-7 + MLFNBlock( + channels[1], channels[2], 2, [256, 128], self.groups + ), + MLFNBlock( + channels[2], channels[2], 1, [256, 128], self.groups + ), + MLFNBlock( + channels[2], channels[2], 1, [256, 128], self.groups + ), + MLFNBlock( + channels[2], channels[2], 1, [256, 128], self.groups + ), + # layer 8-13 + MLFNBlock( + channels[2], channels[3], 2, [512, 128], self.groups + ), + MLFNBlock( + channels[3], channels[3], 1, [512, 128], self.groups + ), + MLFNBlock( + channels[3], channels[3], 1, [512, 128], self.groups + ), + MLFNBlock( + channels[3], channels[3], 1, [512, 128], self.groups + ), + MLFNBlock( + channels[3], channels[3], 1, [512, 128], self.groups + ), + MLFNBlock( + channels[3], channels[3], 1, [512, 128], self.groups + ), + # layer 14-16 + MLFNBlock( + channels[3], channels[4], 2, [512, 128], self.groups + ), + MLFNBlock( + channels[4], channels[4], 1, [512, 128], self.groups + ), + MLFNBlock( + channels[4], channels[4], 1, [512, 128], self.groups + ), + ] + ) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + + # projection functions + self.fc_x = nn.Sequential( + nn.Conv2d(channels[4], embed_dim, 1, bias=False), + nn.BatchNorm2d(embed_dim), + nn.ReLU(inplace=True), + ) + self.fc_s = nn.Sequential( + nn.Conv2d(self.groups * 16, embed_dim, 1, bias=False), + nn.BatchNorm2d(embed_dim), + nn.ReLU(inplace=True), + ) + + self.classifier = nn.Linear(embed_dim, num_classes) + + self.init_params() + + def init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x, inplace=True) + x = self.maxpool(x) + + s_hat = [] + for block in self.feature: + x, s = block(x) + s_hat.append(s) + s_hat = torch.cat(s_hat, 1) + + x = self.global_avgpool(x) + x = self.fc_x(x) + s_hat = self.fc_s(s_hat) + + v = (x + s_hat) * 0.5 + v = v.view(v.size(0), -1) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") def init_pretrained_weights(model, model_url): - """Initializes model with pretrained weights. + """Initializes model with pretrained weights. - Layers that don't match with pretrained layers in name or size are kept unchanged. - """ - pretrain_dict = model_zoo.load_url(model_url) - model_dict = model.state_dict() - pretrain_dict = { - k: v - for k, v in pretrain_dict.items() - if k in model_dict and model_dict[k].size() == v.size() - } - model_dict.update(pretrain_dict) - model.load_state_dict(model_dict) + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) def mlfn(num_classes, loss="softmax", pretrained=True, **kwargs): - model = MLFN(num_classes, loss, **kwargs) - if pretrained: - # init_pretrained_weights(model, model_urls['imagenet']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["imagenet"] - ) - ) - return model + model = MLFN(num_classes, loss, **kwargs) + if pretrained: + # init_pretrained_weights(model, model_urls['imagenet']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["imagenet"] + ) + ) + return model + + # Copied from boxmot/boxmot/reid/backbones/mlfn.py diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py index 35a16219..b3e69186 100644 --- a/ethology/reid/backbones/mobilenetv2.py +++ b/ethology/reid/backbones/mobilenetv2.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import torch.utils.model_zoo as model_zoo from torch import nn @@ -9,239 +8,272 @@ __all__ = ["mobilenetv2_x1_0", "mobilenetv2_x1_4"] model_urls = { - # 1.0: top-1 71.3 - "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", - # 1.4: top-1 73.9 - "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", + # 1.0: top-1 71.3 + "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", + # 1.4: top-1 73.9 + "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", } class ConvBlock(nn.Module): - """Basic convolutional block. + """Basic convolutional block. - convolution (bias discarded) + batch normalization + relu6. + convolution (bias discarded) + batch normalization + relu6. - Args: - in_c (int): number of input channels. - out_c (int): number of output channels. - k (int or tuple): kernel size. - s (int or tuple): stride. - p (int or tuple): padding. - g (int): number of blocked connections from input channels - to output channels (default: 1). - """ + Args: + in_c (int): number of input channels. + out_c (int): number of output channels. + k (int or tuple): kernel size. + s (int or tuple): stride. + p (int or tuple): padding. + g (int): number of blocked connections from input channels + to output channels (default: 1). - def __init__(self, in_c, out_c, k, s=1, p=0, g=1): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) - self.bn = nn.BatchNorm2d(out_c) + """ - def forward(self, x): - return F.relu6(self.bn(self.conv(x))) + def __init__(self, in_c, out_c, k, s=1, p=0, g=1): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + in_c, out_c, k, stride=s, padding=p, bias=False, groups=g + ) + self.bn = nn.BatchNorm2d(out_c) + + def forward(self, x): + return F.relu6(self.bn(self.conv(x))) class Bottleneck(nn.Module): - def __init__(self, in_channels, out_channels, expansion_factor, stride=1): - super(Bottleneck, self).__init__() - mid_channels = in_channels * expansion_factor - self.use_residual = stride == 1 and in_channels == out_channels - self.conv1 = ConvBlock(in_channels, mid_channels, 1) - self.dwconv2 = ConvBlock( - mid_channels, mid_channels, 3, stride, 1, g=mid_channels - ) - self.conv3 = nn.Sequential( - nn.Conv2d(mid_channels, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - m = self.conv1(x) - m = self.dwconv2(m) - m = self.conv3(m) - if self.use_residual: - return x + m - else: - return m + def __init__(self, in_channels, out_channels, expansion_factor, stride=1): + super(Bottleneck, self).__init__() + mid_channels = in_channels * expansion_factor + self.use_residual = stride == 1 and in_channels == out_channels + self.conv1 = ConvBlock(in_channels, mid_channels, 1) + self.dwconv2 = ConvBlock( + mid_channels, mid_channels, 3, stride, 1, g=mid_channels + ) + self.conv3 = nn.Sequential( + nn.Conv2d(mid_channels, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + m = self.conv1(x) + m = self.dwconv2(m) + m = self.conv3(m) + if self.use_residual: + return x + m + else: + return m class MobileNetV2(nn.Module): - """MobileNetV2. - - Reference: - Sandler et al. MobileNetV2: Inverted Residuals and - Linear Bottlenecks. CVPR 2018. - - Public keys: - - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. - - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. - """ - - def __init__( - self, - num_classes, - width_mult=1, - loss="softmax", - fc_dims=None, - dropout_p=None, - **kwargs, - ): - super(MobileNetV2, self).__init__() - self.loss = loss - self.in_channels = int(32 * width_mult) - self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 - - # construct layers - self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) - self.conv2 = self._make_layer(Bottleneck, 1, int(16 * width_mult), 1, 1) - self.conv3 = self._make_layer(Bottleneck, 6, int(24 * width_mult), 2, 2) - self.conv4 = self._make_layer(Bottleneck, 6, int(32 * width_mult), 3, 2) - self.conv5 = self._make_layer(Bottleneck, 6, int(64 * width_mult), 4, 2) - self.conv6 = self._make_layer(Bottleneck, 6, int(96 * width_mult), 3, 1) - self.conv7 = self._make_layer(Bottleneck, 6, int(160 * width_mult), 3, 2) - self.conv8 = self._make_layer(Bottleneck, 6, int(320 * width_mult), 1, 1) - self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) - - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(fc_dims, self.feature_dim, dropout_p) - self.classifier = nn.Linear(self.feature_dim, num_classes) - - self._init_params() - - def _make_layer(self, block, t, c, n, s): - # t: expansion factor - # c: output channels - # n: number of blocks - # s: stride for first layer - layers = [] - layers.append(block(self.in_channels, c, t, s)) - self.in_channels = c - for i in range(1, n): - layers.append(block(self.in_channels, c, t)) - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - """Constructs fully connected layer. - - Args: - fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed - input_dim (int): input dimension - dropout_p (float): dropout probability, if None, dropout is unused - """ - if fc_dims is None: - self.feature_dim = input_dim - return None - - assert isinstance( - fc_dims, (list, tuple) - ), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) - - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - - self.feature_dim = fc_dims[-1] - - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - x = self.conv6(x) - x = self.conv7(x) - x = self.conv8(x) - x = self.conv9(x) - return x - - def forward(self, x): - f = self.featuremaps(x) - v = self.global_avgpool(f) - v = v.view(v.size(0), -1) - - if self.fc is not None: - v = self.fc(v) - - if not self.training: - return v - - y = self.classifier(v) - - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + """MobileNetV2. + + Reference: + Sandler et al. MobileNetV2: Inverted Residuals and + Linear Bottlenecks. CVPR 2018. + + Public keys: + - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. + - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. + """ + + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super(MobileNetV2, self).__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + + # construct layers + self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) + self.conv2 = self._make_layer( + Bottleneck, 1, int(16 * width_mult), 1, 1 + ) + self.conv3 = self._make_layer( + Bottleneck, 6, int(24 * width_mult), 2, 2 + ) + self.conv4 = self._make_layer( + Bottleneck, 6, int(32 * width_mult), 3, 2 + ) + self.conv5 = self._make_layer( + Bottleneck, 6, int(64 * width_mult), 4, 2 + ) + self.conv6 = self._make_layer( + Bottleneck, 6, int(96 * width_mult), 3, 1 + ) + self.conv7 = self._make_layer( + Bottleneck, 6, int(160 * width_mult), 3, 2 + ) + self.conv8 = self._make_layer( + Bottleneck, 6, int(320 * width_mult), 1, 1 + ) + self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) + + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + fc_dims, self.feature_dim, dropout_p + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + + self._init_params() + + def _make_layer(self, block, t, c, n, s): + # t: expansion factor + # c: output channels + # n: number of blocks + # s: stride for first layer + layers = [] + layers.append(block(self.in_channels, c, t, s)) + self.in_channels = c + for i in range(1, n): + layers.append(block(self.in_channels, c, t)) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + """Constructs fully connected layer. + + Args: + fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed + input_dim (int): input dimension + dropout_p (float): dropout probability, if None, dropout is unused + + """ + if fc_dims is None: + self.feature_dim = input_dim + return None + + assert isinstance(fc_dims, (list, tuple)), ( + f"fc_dims must be either list or tuple, but got {type(fc_dims)}" + ) + + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + + self.feature_dim = fc_dims[-1] + + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d) or isinstance( + m, nn.BatchNorm1d + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = self.conv6(x) + x = self.conv7(x) + x = self.conv8(x) + x = self.conv9(x) + return x + + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + + if self.fc is not None: + v = self.fc(v) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") def init_pretrained_weights(model, model_url): - """Initializes model with pretrained weights. + """Initializes model with pretrained weights. - Layers that don't match with pretrained layers in name or size are kept unchanged. - """ - pretrain_dict = model_zoo.load_url(model_url) - model_dict = model.state_dict() - pretrain_dict = { - k: v - for k, v in pretrain_dict.items() - if k in model_dict and model_dict[k].size() == v.size() - } - model_dict.update(pretrain_dict) - model.load_state_dict(model_dict) + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) def mobilenetv2_x1_0(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, loss=loss, width_mult=1, fc_dims=None, dropout_p=None, **kwargs - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_0"] - ) - ) - return model + model = MobileNetV2( + num_classes, + loss=loss, + width_mult=1, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_0"] + ) + ) + return model def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, loss=loss, width_mult=1.4, fc_dims=None, dropout_p=None, **kwargs - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_4"] - ) - ) - return model + model = MobileNetV2( + num_classes, + loss=loss, + width_mult=1.4, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_4"] + ) + ) + return model + + # Copied from boxmot/boxmot/reid/backbones/mobilenetv2.py diff --git a/ethology/reid/backbones/osnet.py b/ethology/reid/backbones/osnet.py index c07e4e45..c13dd5b7 100644 --- a/ethology/reid/backbones/osnet.py +++ b/ethology/reid/backbones/osnet.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import warnings @@ -8,338 +7,529 @@ from torch import nn from torch.nn import functional as F -__all__ = ["osnet_x1_0", "osnet_x0_75", "osnet_x0_5", "osnet_x0_25", "osnet_ibn_x1_0"] +__all__ = [ + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", +] pretrained_urls = { - "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", - "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", - "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", - "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", - "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", + "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", + "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", + "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", + "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", + "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, ChannelGate, OSBlock... + class ConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + groups=1, + IN=False, + ): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + groups=groups, + ) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 1, + stride=stride, + padding=0, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) - self.bn = nn.BatchNorm2d(out_channels) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return x + def __init__(self, in_channels, out_channels, stride=1): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) - self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, 1, stride=1, padding=0, bias=False + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=False, + groups=out_channels, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + x = self.relu(x) + return x + class ChannelGate(nn.Module): - def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU(inplace=True) - self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU(inplace=True) - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x + def __init__( + self, + in_channels, + num_gates=None, + return_gates=False, + gate_activation="sigmoid", + reduction=16, + layer_norm=False, + ): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + in_channels, + in_channels // reduction, + kernel_size=1, + bias=True, + padding=0, + ) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d( + in_channels // reduction, + num_gates, + kernel_size=1, + bias=True, + padding=0, + ) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU(inplace=True) + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError(f"Unknown gate activation: {gate_activation}") + + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + class OSBlock(nn.Module): - def __init__(self, in_channels, out_channels, IN=False, bottleneck_reduction=4, **kwargs): - super(OSBlock, self).__init__() - mid_channels = out_channels // bottleneck_reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2a = LightConv3x3(mid_channels, mid_channels) - self.conv2b = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2c = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2d = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = None - if IN: - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2a = self.conv2a(x1) - x2b = self.conv2b(x1) - x2c = self.conv2c(x1) - x2d = self.conv2d(x1) - x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - if self.IN is not None: - out = self.IN(out) - return F.relu(out) + def __init__( + self, + in_channels, + out_channels, + IN=False, + bottleneck_reduction=4, + **kwargs, + ): + super(OSBlock, self).__init__() + mid_channels = out_channels // bottleneck_reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2a = LightConv3x3(mid_channels, mid_channels) + self.conv2b = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2c = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2d = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = None + if IN: + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2a = self.conv2a(x1) + x2b = self.conv2b(x1) + x2c = self.conv2c(x1) + x2d = self.conv2d(x1) + x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + if self.IN is not None: + out = self.IN(out) + return F.relu(out) + class OSNet(nn.Module): - def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", IN=False, **kwargs): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1], reduce_spatial_size=True, IN=IN) - self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2], reduce_spatial_size=True) - self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3], reduce_spatial_size=False) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - def _make_layer(self, block, layer, in_channels, out_channels, reduce_spatial_size, IN=False): - layers = [] - layers.append(block(in_channels, out_channels, IN=IN)) - for i in range(1, layer): - layers.append(block(out_channels, out_channels, IN=IN)) - if reduce_spatial_size: - layers.append(nn.Sequential(Conv1x1(out_channels, out_channels), nn.AvgPool2d(2, stride=2))) - return nn.Sequential(*layers) - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + blocks, + layers, + channels, + feature_dim=512, + loss="softmax", + IN=False, + **kwargs, + ): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer( + blocks[0], + layers[0], + channels[0], + channels[1], + reduce_spatial_size=True, + IN=IN, + ) + self.conv3 = self._make_layer( + blocks[1], + layers[1], + channels[1], + channels[2], + reduce_spatial_size=True, + ) + self.conv4 = self._make_layer( + blocks[2], + layers[2], + channels[2], + channels[3], + reduce_spatial_size=False, + ) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + self.feature_dim, channels[3], dropout_p=None + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + + def _make_layer( + self, + block, + layer, + in_channels, + out_channels, + reduce_spatial_size, + IN=False, + ): + layers = [] + layers.append(block(in_channels, out_channels, IN=IN)) + for i in range(1, layer): + layers.append(block(out_channels, out_channels, IN=IN)) + if reduce_spatial_size: + layers.append( + nn.Sequential( + Conv1x1(out_channels, out_channels), + nn.AvgPool2d(2, stride=2), + ) + ) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d) or isinstance( + m, nn.BatchNorm1d + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") + def init_pretrained_weights(model, key=""): - import errno - import os - from collections import OrderedDict - import gdown - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), - ) - ) - return torch_home - filename = key + "_imagenet.pth" - # Try ethology/models/ directory first - ethology_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) - models_dir = os.path.join(ethology_root, "models") - os.makedirs(models_dir, exist_ok=True) - local_file = os.path.join(models_dir, filename) - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - os.makedirs(model_dir, exist_ok=True) - cached_file = os.path.join(model_dir, filename) - # Prefer ethology/models/ directory file if present - if os.path.exists(local_file): - print(f"[OSNet] Loading model weights from {local_file}") - cached_file = local_file - elif os.path.exists(cached_file): - print(f"[OSNet] Loading model weights from {cached_file}") - else: - print(f"[OSNet] Downloading model weights to {cached_file}") - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights from "{}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)".format(cached_file) - ) - else: - print( - 'Successfully loaded imagenet pretrained weights from "{}"'.format( - cached_file - ) - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - "due to unmatched keys or layer size: {}".format(discarded_layers) - ) + import os + from collections import OrderedDict + + import gdown + + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" + ), + ) + ) + return torch_home + + filename = key + "_imagenet.pth" + # Try ethology/models/ directory first + ethology_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../../") + ) + models_dir = os.path.join(ethology_root, "models") + os.makedirs(models_dir, exist_ok=True) + local_file = os.path.join(models_dir, filename) + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + os.makedirs(model_dir, exist_ok=True) + cached_file = os.path.join(model_dir, filename) + # Prefer ethology/models/ directory file if present + if os.path.exists(local_file): + print(f"[OSNet] Loading model weights from {local_file}") + cached_file = local_file + elif os.path.exists(cached_file): + print(f"[OSNet] Loading model weights from {cached_file}") + else: + print(f"[OSNet] Downloading model weights to {cached_file}") + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + f'The pretrained weights from "{cached_file}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)" + ) + else: + print( + f'Successfully loaded imagenet pretrained weights from "{cached_file}"' + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + f"due to unmatched keys or layer size: {discarded_layers}" + ) + def osnet_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x1_0") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x1_0") + return model + def osnet_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_75") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[48, 192, 288, 384], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_75") + return model + def osnet_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_5") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[32, 128, 192, 256], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_5") + return model + def osnet_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_25") - return model - -def osnet_ibn_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ibn_x1_0") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[16, 64, 96, 128], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_25") + return model + + +def osnet_ibn_x1_0( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ibn_x1_0") + return model diff --git a/ethology/reid/backbones/osnet_ain.py b/ethology/reid/backbones/osnet_ain.py index 9e052209..2ef3da25 100644 --- a/ethology/reid/backbones/osnet_ain.py +++ b/ethology/reid/backbones/osnet_ain.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import warnings @@ -8,349 +7,541 @@ from torch import nn from torch.nn import functional as F -__all__ = ["osnet_ain_x1_0", "osnet_ain_x0_75", "osnet_ain_x0_5", "osnet_ain_x0_25"] +__all__ = [ + "osnet_ain_x1_0", + "osnet_ain_x0_75", + "osnet_ain_x0_5", + "osnet_ain_x0_25", +] pretrained_urls = { - "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", - "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", - "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", - "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", + "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", + "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", + "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", + "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, LightConvStream, ChannelGate, OSBlock, OSBlockINin, OSNet, init_pretrained_weights, and instantiation functions... + class ConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + groups=1, + IN=False, + ): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + groups=groups, + ) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 1, + stride=stride, + padding=0, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, bn=True): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) - self.bn = None - if bn: - self.bn = nn.BatchNorm2d(out_channels) - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - return x + def __init__(self, in_channels, out_channels, stride=1, bn=True): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=stride, padding=0, bias=False + ) + self.bn = None + if bn: + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + return x + class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) - self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, 1, stride=1, padding=0, bias=False + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=False, + groups=out_channels, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + return self.relu(x) + class LightConvStream(nn.Module): - def __init__(self, in_channels, out_channels, depth): - super(LightConvStream, self).__init__() - assert depth >= 1 - layers = [LightConv3x3(in_channels, out_channels)] - for i in range(depth - 1): - layers.append(LightConv3x3(out_channels, out_channels)) - self.layers = nn.Sequential(*layers) - def forward(self, x): - return self.layers(x) + def __init__(self, in_channels, out_channels, depth): + super(LightConvStream, self).__init__() + assert depth >= 1 + layers = [LightConv3x3(in_channels, out_channels)] + for i in range(depth - 1): + layers.append(LightConv3x3(out_channels, out_channels)) + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + class ChannelGate(nn.Module): - def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU() - self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU() - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x + def __init__( + self, + in_channels, + num_gates=None, + return_gates=False, + gate_activation="sigmoid", + reduction=16, + layer_norm=False, + ): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + in_channels, + in_channels // reduction, + kernel_size=1, + bias=True, + padding=0, + ) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU() + self.fc2 = nn.Conv2d( + in_channels // reduction, + num_gates, + kernel_size=1, + bias=True, + padding=0, + ) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU() + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError(f"Unknown gate activation: {gate_activation}") + + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + class OSBlock(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlock, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlock, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList( + [ + LightConvStream(mid_channels, mid_channels, t) + for t in range(1, T + 1) + ] + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + class OSBlockINin(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlockINin, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - x3 = self.IN(x3) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlockINin, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList( + [ + LightConvStream(mid_channels, mid_channels, t) + for t in range(1, T + 1) + ] + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + x3 = self.IN(x3) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + class OSNet(nn.Module): - def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", conv1_IN=False, **kwargs): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=conv1_IN) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1]) - self.pool2 = nn.Sequential(Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2)) - self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2]) - self.pool3 = nn.Sequential(Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2)) - self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3]) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - def _make_layer(self, blocks, layer, in_channels, out_channels): - layers = [] - layers += [blocks[0](in_channels, out_channels)] - for i in range(1, len(blocks)): - layers += [blocks[i](out_channels, out_channels)] - return nn.Sequential(*layers) - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU()) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.InstanceNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.pool2(x) - x = self.conv3(x) - x = self.pool3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + blocks, + layers, + channels, + feature_dim=512, + loss="softmax", + conv1_IN=False, + **kwargs, + ): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer( + 3, channels[0], 7, stride=2, padding=3, IN=conv1_IN + ) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer( + blocks[0], layers[0], channels[0], channels[1] + ) + self.pool2 = nn.Sequential( + Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2) + ) + self.conv3 = self._make_layer( + blocks[1], layers[1], channels[1], channels[2] + ) + self.pool3 = nn.Sequential( + Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2) + ) + self.conv4 = self._make_layer( + blocks[2], layers[2], channels[2], channels[3] + ) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + self.feature_dim, channels[3], dropout_p=None + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + + def _make_layer(self, blocks, layer, in_channels, out_channels): + layers = [] + layers += [blocks[0](in_channels, out_channels)] + for i in range(1, len(blocks)): + layers += [blocks[i](out_channels, out_channels)] + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU()) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif ( + isinstance(m, nn.BatchNorm2d) + or isinstance(m, nn.BatchNorm1d) + or isinstance(m, nn.InstanceNorm2d) + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.pool2(x) + x = self.conv3(x) + x = self.pool3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") + def init_pretrained_weights(model, key=""): - import errno - import os - from collections import OrderedDict - import gdown - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), - ) - ) - return torch_home - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - try: - os.makedirs(model_dir) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - filename = key + "_imagenet.pth" - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights from "{}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)".format(cached_file) - ) - else: - print( - 'Successfully loaded imagenet pretrained weights from "{}"'.format( - cached_file - ) - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - "due to unmatched keys or layer size: {}".format(discarded_layers) - ) - -def osnet_ain_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x1_0") - return model - -def osnet_ain_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_75") - return model - -def osnet_ain_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_5") - return model - -def osnet_ain_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_25") - return model + import errno + import os + from collections import OrderedDict + + import gdown + + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" + ), + ) + ) + return torch_home + + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + try: + os.makedirs(model_dir) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + filename = key + "_imagenet.pth" + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + f'The pretrained weights from "{cached_file}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)" + ) + else: + print( + f'Successfully loaded imagenet pretrained weights from "{cached_file}"' + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + f"due to unmatched keys or layer size: {discarded_layers}" + ) + + +def osnet_ain_x1_0( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x1_0") + return model + + +def osnet_ain_x0_75( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[48, 192, 288, 384], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_75") + return model + + +def osnet_ain_x0_5( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[32, 128, 192, 256], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_5") + return model + + +def osnet_ain_x0_25( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[16, 64, 96, 128], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_25") + return model diff --git a/ethology/reid/backbones/resnet.py b/ethology/reid/backbones/resnet.py index 12ca5639..80eda98e 100644 --- a/ethology/reid/backbones/resnet.py +++ b/ethology/reid/backbones/resnet.py @@ -1,273 +1,461 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -""" -Code source: https://github.com/pytorch/vision -""" -from __future__ import absolute_import, division +"""Code source: https://github.com/pytorch/vision""" import torch.utils.model_zoo as model_zoo from torch import nn __all__ = [ - "resnet18", - "resnet34", - "resnet50", - "resnet101", - "resnet152", - "resnext50_32x4d", - "resnext101_32x8d", - "resnet50_fc512", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext101_32x8d", + "resnet50_fc512", ] model_urls = { - "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", - "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", - "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", - "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", - "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", - "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", - "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", } # ...existing code for conv3x3, conv1x1, BasicBlock, Bottleneck, ResNet, init_pretrained_weights, and instantiation functions... + def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - return nn.Conv2d( - in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=dilation, - groups=groups, - bias=False, - dilation=dilation, - ) + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + def conv1x1(in_planes, out_planes, stride=1): - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False + ) + class BasicBlock(nn.Module): - expansion = 1 - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError("BasicBlock only supports groups=1 and base_width=64") - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None, + ): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + "BasicBlock only supports groups=1 and base_width=64" + ) + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock" + ) + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + class Bottleneck(nn.Module): - expansion = 4 - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.0)) * groups - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - out = self.conv3(out) - out = self.bn3(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out + expansion = 4 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None, + ): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + class ResNet(nn.Module): - def __init__(self, num_classes, loss, block, layers, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None, last_stride=2, fc_dims=None, dropout_p=None, **kwargs): - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - self.loss = loss - self.feature_dim = 512 * block.expansion - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=last_stride, dilate=replace_stride_with_dilation[2]) - self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = self._construct_fc_layer(fc_dims, 512 * block.expansion, dropout_p) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) - return nn.Sequential(*layers) - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None: - self.feature_dim = input_dim - return None - assert isinstance(fc_dims, (list, tuple)), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - def featuremaps(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - return x - def forward(self, x): - f = self.featuremaps(x) - v = self.global_avgpool(f) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + loss, + block, + layers, + zero_init_residual=False, + groups=1, + width_per_group=64, + replace_stride_with_dilation=None, + norm_layer=None, + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + self.loss = loss + self.feature_dim = 512 * block.expansion + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + f"replace_stride_with_dilation should be None or a 3-element tuple, got {replace_stride_with_dilation}" + ) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0], + ) + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1], + ) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=last_stride, + dilate=replace_stride_with_dilation[2], + ) + self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = self._construct_fc_layer( + fc_dims, 512 * block.expansion, dropout_p + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + self.groups, + self.base_width, + previous_dilation, + norm_layer, + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer, + ) + ) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None: + self.feature_dim = input_dim + return None + assert isinstance(fc_dims, (list, tuple)), ( + f"fc_dims must be either list or tuple, but got {type(fc_dims)}" + ) + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d) or isinstance( + m, nn.BatchNorm1d + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") + def init_pretrained_weights(model, model_url): - pretrain_dict = model_zoo.load_url(model_url) - model_dict = model.state_dict() - pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size()} - model_dict.update(pretrain_dict) - model.load_state_dict(model_dict) + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) + def resnet18(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[2, 2, 2, 2], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet18"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=BasicBlock, + layers=[2, 2, 2, 2], + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet18"]) + return model + def resnet34(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet34"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=BasicBlock, + layers=[3, 4, 6, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet34"]) + return model + def resnet50(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet50"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 4, 6, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model + def resnet101(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet101"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 4, 23, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet101"]) + return model + def resnet152(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 8, 36, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet152"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 8, 36, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet152"]) + return model + def resnext50_32x4d(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=4, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnext50_32x4d"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 4, 6, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + groups=32, + width_per_group=4, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnext50_32x4d"]) + return model + def resnext101_32x8d(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=8, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnext101_32x8d"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 4, 23, 3], + last_stride=2, + fc_dims=None, + dropout_p=None, + groups=32, + width_per_group=8, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnext101_32x8d"]) + return model + def resnet50_fc512(num_classes, loss="softmax", pretrained=True, **kwargs): - model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=1, fc_dims=[512], dropout_p=None, **kwargs) - if pretrained: - init_pretrained_weights(model, model_urls["resnet50"]) - return model + model = ResNet( + num_classes=num_classes, + loss=loss, + block=Bottleneck, + layers=[3, 4, 6, 3], + last_stride=1, + fc_dims=[512], + dropout_p=None, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model diff --git a/ethology/reid/backends/base_backend.py b/ethology/reid/backends/base_backend.py index 688ec43a..0edb2826 100644 --- a/ethology/reid/backends/base_backend.py +++ b/ethology/reid/backends/base_backend.py @@ -1,18 +1,19 @@ - -import os from abc import abstractmethod from pathlib import Path + import cv2 import gdown import numpy as np import torch from filelock import SoftFileLock + from ethology.reid.core.registry import ReIDModelRegistry + # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.checks import RequirementsChecker # If needed, implement or set RequirementsChecker -class BaseModelBackend: +class BaseModelBackend: def __init__(self, weights, device, half): self.weights = weights[0] if isinstance(weights, list) else weights if isinstance(self.weights, str): @@ -22,7 +23,7 @@ def __init__(self, weights, device, half): self.half = half self.model = None # Support both string and torch.device for device - if hasattr(self.device, 'type'): + if hasattr(self.device, "type"): self.cuda = torch.cuda.is_available() and self.device.type != "cpu" else: self.cuda = torch.cuda.is_available() and self.device != "cpu" @@ -41,11 +42,19 @@ def __init__(self, weights, device, half): self.load_model(self.weights) - self.mean_array = torch.tensor([0.485, 0.456, 0.406], device=self.device).view(1, 3, 1, 1) - self.std_array = torch.tensor([0.229, 0.224, 0.225], device=self.device).view(1, 3, 1, 1) + self.mean_array = torch.tensor( + [0.485, 0.456, 0.406], device=self.device + ).view(1, 3, 1, 1) + self.std_array = torch.tensor( + [0.229, 0.224, 0.225], device=self.device + ).view(1, 3, 1, 1) if "clip" in self.model_name: - self.mean_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) - self.std_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) + self.mean_array = torch.tensor( + [0.5, 0.5, 0.5], device=self.device + ).view(1, 3, 1, 1) + self.std_array = torch.tensor( + [0.5, 0.5, 0.5], device=self.device + ).view(1, 3, 1, 1) if "vehicleid" in self.weights.name or "veri" in self.weights.name: input_shape = (256, 256) @@ -57,7 +66,6 @@ def __init__(self, weights, device, half): input_shape = (256, 128) self.input_shape = input_shape - def get_crops(self, xyxys, img): h, w = img.shape[:2] interpolation_method = cv2.INTER_LINEAR @@ -85,7 +93,6 @@ def get_crops(self, xyxys, img): crops = (crops - self.mean_array) / self.std_array return crops - @torch.no_grad() def get_features(self, xyxys, img): if xyxys.size != 0: @@ -98,7 +105,6 @@ def get_features(self, xyxys, img): features = features / np.linalg.norm(features, axis=-1, keepdims=True) return features - def warmup(self, imgsz=[(256, 128, 3)]): if self.device.type != "cpu": im = np.random.randint(0, 255, *imgsz, dtype=np.uint8) @@ -108,11 +114,9 @@ def warmup(self, imgsz=[(256, 128, 3)]): crops = self.inference_preprocess(crops) self.forward(crops) - def to_numpy(self, x): return x.cpu().numpy() if isinstance(x, torch.Tensor) else x - def inference_preprocess(self, x): if self.half: if isinstance(x, torch.Tensor): @@ -121,32 +125,34 @@ def inference_preprocess(self, x): elif isinstance(x, np.ndarray): if x.dtype != np.float16: x = x.astype(np.float16) - if hasattr(self, 'nhwc') and self.nhwc: + if hasattr(self, "nhwc") and self.nhwc: if isinstance(x, torch.Tensor): x = x.permute(0, 2, 3, 1) elif isinstance(x, np.ndarray): x = np.transpose(x, (0, 2, 3, 1)) return x - def inference_postprocess(self, features): if isinstance(features, (list, tuple)): return ( - self.to_numpy(features[0]) if len(features) == 1 else [self.to_numpy(x) for x in features] + self.to_numpy(features[0]) + if len(features) == 1 + else [self.to_numpy(x) for x in features] ) else: return self.to_numpy(features) - @abstractmethod def forward(self, im_batch): - raise NotImplementedError("This method should be implemented by subclasses.") - + raise NotImplementedError( + "This method should be implemented by subclasses." + ) @abstractmethod def load_model(self, w): - raise NotImplementedError("This method should be implemented by subclasses.") - + raise NotImplementedError( + "This method should be implemented by subclasses." + ) def download_model(self, w): if isinstance(w, str): diff --git a/ethology/reid/backends/onnx_backend.py b/ethology/reid/backends/onnx_backend.py index c7c93017..41aefb7f 100644 --- a/ethology/reid/backends/onnx_backend.py +++ b/ethology/reid/backends/onnx_backend.py @@ -1,31 +1,34 @@ - from ethology.reid.backends.base_backend import BaseModelBackend + class ONNXBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # ONNXRuntime will attempt to use the first provider, and if it fails or is not + # available for some reason, it will fall back to the next provider in the list + if self.device.type == "mps": + # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) + providers = ["MPSExecutionProvider", "CPUExecutionProvider"] + elif self.device.type == "cuda": + # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + else: + # self.checker.check_packages(("onnxruntime==1.18.1",)) + providers = ["CPUExecutionProvider"] + import onnxruntime - def load_model(self, w): - # ONNXRuntime will attempt to use the first provider, and if it fails or is not - # available for some reason, it will fall back to the next provider in the list - if self.device.type == "mps": - # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) - providers = ["MPSExecutionProvider", "CPUExecutionProvider"] - elif self.device.type == "cuda": - # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - else: - # self.checker.check_packages(("onnxruntime==1.18.1",)) - providers = ["CPUExecutionProvider"] - import onnxruntime - self.session = onnxruntime.InferenceSession(str(w), providers=providers) + self.session = onnxruntime.InferenceSession( + str(w), providers=providers + ) - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() - features = self.session.run( - [self.session.get_outputs()[0].name], - {self.session.get_inputs()[0].name: im_batch}, - )[0] - return features + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() + features = self.session.run( + [self.session.get_outputs()[0].name], + {self.session.get_inputs()[0].name: im_batch}, + )[0] + return features diff --git a/ethology/reid/backends/openvino_backend.py b/ethology/reid/backends/openvino_backend.py index f06392bf..0c56a06e 100644 --- a/ethology/reid/backends/openvino_backend.py +++ b/ethology/reid/backends/openvino_backend.py @@ -1,48 +1,49 @@ from pathlib import Path from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed + class OpenVinoBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # self.checker.check_packages(("openvino>=2025.2.0",)) + + print(f"Loading {w} for OpenVINO inference...") + try: + # requires openvino-dev: https://pypi.org/project/openvino-dev/ + from openvino import Core, Layout + except ImportError: + print( + f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" + "requires openvino pip package to be installed!\n" + "$ pip install openvino>=2025.2.0\n" + ) + raise + ie = Core() + w = Path(w) + print(w) + if w.suffix == ".bin": + w = w.with_suffix(".xml") + + if not w.is_file(): # if not *.xml + w = next( + Path(w).glob("*.xml") + ) # get *.xml file from *_openvino_model dir + network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) + if network.get_parameters()[0].get_layout().empty: + network.get_parameters()[0].set_layout(Layout("NCWH")) + self.executable_network = ie.compile_model( + network, device_name="CPU" + ) # device_name="MYRIAD" for Intel NCS2 + self.output_layer = next(iter(self.executable_network.outputs)) - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - - def load_model(self, w): - # self.checker.check_packages(("openvino>=2025.2.0",)) - - print(f"Loading {w} for OpenVINO inference...") - try: - # requires openvino-dev: https://pypi.org/project/openvino-dev/ - from openvino import Core, Layout - except ImportError: - print( - f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" - "requires openvino pip package to be installed!\n" - "$ pip install openvino>=2025.2.0\n" - ) - raise - ie = Core() - w = Path(w) - print(w) - if w.suffix == '.bin': - w = w.with_suffix('.xml') - - if not w.is_file(): # if not *.xml - w = next( - Path(w).glob("*.xml") - ) # get *.xml file from *_openvino_model dir - network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) - if network.get_parameters()[0].get_layout().empty: - network.get_parameters()[0].set_layout(Layout("NCWH")) - self.executable_network = ie.compile_model( - network, device_name="CPU" - ) # device_name="MYRIAD" for Intel NCS2 - self.output_layer = next(iter(self.executable_network.outputs)) - - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() # FP32 - features = self.executable_network([im_batch])[self.output_layer] - return features + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() # FP32 + features = self.executable_network([im_batch])[self.output_layer] + return features diff --git a/ethology/reid/backends/pytorch_backend.py b/ethology/reid/backends/pytorch_backend.py index d3dbfa06..2e859cc8 100644 --- a/ethology/reid/backends/pytorch_backend.py +++ b/ethology/reid/backends/pytorch_backend.py @@ -1,20 +1,20 @@ from ethology.reid.backends.base_backend import BaseModelBackend from ethology.reid.core.registry import ReIDModelRegistry -class PyTorchBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half +class PyTorchBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half - def load_model(self, w): - # Load a PyTorch model - if w and w.is_file(): - ReIDModelRegistry.load_pretrained_weights(self.model, w) - self.model.to(self.device).eval() - self.model.half() if self.half else self.model.float() + def load_model(self, w): + # Load a PyTorch model + if w and w.is_file(): + ReIDModelRegistry.load_pretrained_weights(self.model, w) + self.model.to(self.device).eval() + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/backends/tensorrt_backend.py b/ethology/reid/backends/tensorrt_backend.py index 8dd7d7ee..4f6e95b0 100644 --- a/ethology/reid/backends/tensorrt_backend.py +++ b/ethology/reid/backends/tensorrt_backend.py @@ -1,310 +1,400 @@ +# Note: LOGGER can be replaced with print or a local logger if needed +import os from collections import OrderedDict, namedtuple import numpy as np import torch from ethology.reid.backends.base_backend import BaseModelBackend -# Note: LOGGER can be replaced with print or a local logger if needed - -import os -import sys -import torch -import numpy as np -from collections import namedtuple, OrderedDict - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) class TensorRTBackend(BaseModelBackend): - def __init__(self, engine_path, device=None): - import hashlib - import requests - self.device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) - self.fp16 = False - self.model_ = None - self.context = None - self.bindings = None - self.binding_addrs = None - self.is_trt10 = False - # Download engine if engine_path is a URL - if engine_path.startswith("http://") or engine_path.startswith("https://"): - # Use a hash of the URL for filename - engine_hash = hashlib.md5(engine_path.encode()).hexdigest() - filename = f"trt_engine_{engine_hash}.engine" - cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") - os.makedirs(cache_dir, exist_ok=True) - cached_file = os.path.join(cache_dir, filename) - if not os.path.exists(cached_file): - print(f"[TensorRT] Downloading engine from {engine_path} to {cached_file}") - with requests.get(engine_path, stream=True) as r: - r.raise_for_status() - with open(cached_file, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - else: - print(f"[TensorRT] Using cached engine at {cached_file}") - self.engine_path = cached_file - else: - self.engine_path = engine_path - self.load_model(self.engine_path) - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - try: - import tensorrt as trt - import pycuda.driver as cuda - import pycuda.autoinit # noqa: F401 - except ImportError: - raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH.") - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError("CUDA device not available for TensorRT inference.") - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f: - with trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - # self.checker.check_packages(("nvidia-tensorrt",)) - try: - import tensorrt as trt # TensorRT library - except ImportError: - raise ImportError("Please install tensorrt to use this backend.") - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError("CUDA device not available for TensorRT inference.") - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f: - with trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features + def __init__(self, engine_path, device=None): + import hashlib + + import requests + + self.device = device or ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + self.fp16 = False + self.model_ = None + self.context = None + self.bindings = None + self.binding_addrs = None + self.is_trt10 = False + # Download engine if engine_path is a URL + if engine_path.startswith("http://") or engine_path.startswith( + "https://" + ): + # Use a hash of the URL for filename + engine_hash = hashlib.md5(engine_path.encode()).hexdigest() + filename = f"trt_engine_{engine_hash}.engine" + cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") + os.makedirs(cache_dir, exist_ok=True) + cached_file = os.path.join(cache_dir, filename) + if not os.path.exists(cached_file): + print( + f"[TensorRT] Downloading engine from {engine_path} to {cached_file}" + ) + with requests.get(engine_path, stream=True) as r: + r.raise_for_status() + with open(cached_file, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + else: + print(f"[TensorRT] Using cached engine at {cached_file}") + self.engine_path = cached_file + else: + self.engine_path = engine_path + self.load_model(self.engine_path) + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + try: + import pycuda.autoinit # noqa: F401 + import pycuda.driver as cuda + import tensorrt as trt + except ImportError: + raise ImportError( + "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH." + ) + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError( + "CUDA device not available for TensorRT inference." + ) + + Binding = namedtuple( + "Binding", ("name", "dtype", "shape", "data", "ptr") + ) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f, trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + self.bindings["output"].data.resize_( + tuple(self.context.get_tensor_shape("output")) + ) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, ( + f"Input size {temp_batch.shape} does not match model size {s}" + ) + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + # self.checker.check_packages(("nvidia-tensorrt",)) + try: + import tensorrt as trt # TensorRT library + except ImportError: + raise ImportError("Please install tensorrt to use this backend.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError( + "CUDA device not available for TensorRT inference." + ) + + Binding = namedtuple( + "Binding", ("name", "dtype", "shape", "data", "ptr") + ) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f, trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + self.bindings["output"].data.resize_( + tuple(self.context.get_tensor_shape("output")) + ) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, ( + f"Input size {temp_batch.shape} does not match model size {s}" + ) + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features diff --git a/ethology/reid/backends/tflite_backend.py b/ethology/reid/backends/tflite_backend.py index b0a7b707..eb10d4e8 100644 --- a/ethology/reid/backends/tflite_backend.py +++ b/ethology/reid/backends/tflite_backend.py @@ -4,37 +4,39 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed + class TFLiteBackend(BaseModelBackend): - """ - A class to handle TensorFlow Lite model inference with dynamic batch size support. - """ - def __init__(self, weights: Path, device: str, half: bool): - super().__init__(weights, device, half) - self.nhwc = True - self.half = False - - def load_model(self, w): - # self.checker.check_packages(("tensorflow",)) - print(f"Loading {str(w)} for TensorFlow Lite inference...") - import tensorflow as tf - self.interpreter = tf.lite.Interpreter(model_path=str(w)) - self.interpreter.allocate_tensors() - self.input_details = self.interpreter.get_input_details() - self.output_details = self.interpreter.get_output_details() - self.current_allocated_batch_size = self.input_details[0]["shape"][0] - - def forward(self, im_batch: torch.Tensor) -> np.ndarray: - im_batch = im_batch.cpu().numpy() - batch_size = im_batch.shape[0] - if batch_size != self.current_allocated_batch_size: - self.interpreter.resize_tensor_input( - self.input_details[0]["index"], [batch_size, 256, 128, 3] - ) - self.interpreter.allocate_tensors() - self.current_allocated_batch_size = batch_size - self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) - self.interpreter.invoke() - features = self.interpreter.get_tensor(self.output_details[0]["index"]) - return features + """A class to handle TensorFlow Lite model inference with dynamic batch size support.""" + + def __init__(self, weights: Path, device: str, half: bool): + super().__init__(weights, device, half) + self.nhwc = True + self.half = False + + def load_model(self, w): + # self.checker.check_packages(("tensorflow",)) + print(f"Loading {str(w)} for TensorFlow Lite inference...") + import tensorflow as tf + + self.interpreter = tf.lite.Interpreter(model_path=str(w)) + self.interpreter.allocate_tensors() + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + self.current_allocated_batch_size = self.input_details[0]["shape"][0] + + def forward(self, im_batch: torch.Tensor) -> np.ndarray: + im_batch = im_batch.cpu().numpy() + batch_size = im_batch.shape[0] + if batch_size != self.current_allocated_batch_size: + self.interpreter.resize_tensor_input( + self.input_details[0]["index"], [batch_size, 256, 128, 3] + ) + self.interpreter.allocate_tensors() + self.current_allocated_batch_size = batch_size + self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) + self.interpreter.invoke() + features = self.interpreter.get_tensor(self.output_details[0]["index"]) + return features diff --git a/ethology/reid/backends/torchscript_backend.py b/ethology/reid/backends/torchscript_backend.py index b6602171..1142fcc4 100644 --- a/ethology/reid/backends/torchscript_backend.py +++ b/ethology/reid/backends/torchscript_backend.py @@ -1,20 +1,21 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed -class TorchscriptBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half +class TorchscriptBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half - def load_model(self, w): - print(f"Loading {w} for TorchScript inference...") - self.model = torch.jit.load(w) - self.model.half() if self.half else self.model.float() + def load_model(self, w): + print(f"Loading {w} for TorchScript inference...") + self.model = torch.jit.load(w) + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/core/auto_backend.py b/ethology/reid/core/auto_backend.py index 6f43eba2..22f2c4e2 100644 --- a/ethology/reid/core/auto_backend.py +++ b/ethology/reid/core/auto_backend.py @@ -1,74 +1,89 @@ - from pathlib import Path -from typing import Tuple, Union + import torch + from ethology.reid.backends.onnx_backend import ONNXBackend from ethology.reid.backends.openvino_backend import OpenVinoBackend from ethology.reid.backends.pytorch_backend import PyTorchBackend + try: - from ethology.reid.backends.tensorrt_backend import TensorRTBackend + from ethology.reid.backends.tensorrt_backend import TensorRTBackend except ImportError: - class TensorRTBackend: - def __init__(self, *args, **kwargs): - raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH.") + + class TensorRTBackend: + def __init__(self, *args, **kwargs): + raise ImportError( + "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH." + ) + + from ethology.reid.backends.tflite_backend import TFLiteBackend from ethology.reid.backends.torchscript_backend import TorchscriptBackend + # from ethology.reid.core import export_formats # If needed, implement or copy export_formats # from ethology.utils import WEIGHTS # If needed, implement or set WEIGHTS # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.torch_utils import select_device # If needed, implement or set select_device + class ReidAutoBackend: - def __init__( - self, - weights: Path, - device: torch.device = torch.device("cpu"), - half: bool = False, - ): - super().__init__() - w = weights[0] if isinstance(weights, list) else weights - ( - self.pt, - self.pth, - self.jit, - self.onnx, - self.xml, - self.engine, - self.tflite, - ) = self.model_type(w) - self.weights = weights - self.device = device # For simplicity, skip select_device for now - self.half = half - self.model = self.get_backend() + def __init__( + self, + weights: Path, + device: torch.device = torch.device("cpu"), + half: bool = False, + ): + super().__init__() + w = weights[0] if isinstance(weights, list) else weights + ( + self.pt, + self.pth, + self.jit, + self.onnx, + self.xml, + self.engine, + self.tflite, + ) = self.model_type(w) + self.weights = weights + self.device = device # For simplicity, skip select_device for now + self.half = half + self.model = self.get_backend() - def get_backend(self): - backend_map = { - self.pt or self.pth: PyTorchBackend, - self.jit: TorchscriptBackend, - self.onnx: ONNXBackend, - self.engine: TensorRTBackend, - self.xml: OpenVinoBackend, - self.tflite: TFLiteBackend, - } - for condition, backend_class in backend_map.items(): - if condition: - return backend_class(self.weights, self.device, self.half) - raise RuntimeError("This model framework is not supported yet!") + def get_backend(self): + backend_map = { + self.pt or self.pth: PyTorchBackend, + self.jit: TorchscriptBackend, + self.onnx: ONNXBackend, + self.engine: TensorRTBackend, + self.xml: OpenVinoBackend, + self.tflite: TFLiteBackend, + } + for condition, backend_class in backend_map.items(): + if condition: + return backend_class(self.weights, self.device, self.half) + raise RuntimeError("This model framework is not supported yet!") - def check_suffix(self, file: Path = "osnet_x0_25_msmt17.pt", suffix: Union[str, Tuple[str, ...]] = (".pt",), msg: str = ""): - suffix = [suffix] if isinstance(suffix, str) else list(suffix) - files = [file] if isinstance(file, (str, Path)) else list(file) - for f in files: - file_suffix = Path(f).suffix.lower() - if file_suffix and file_suffix not in suffix: - print(f"File {f} does not have an acceptable suffix. Expected: {suffix}") + def check_suffix( + self, + file: Path = "osnet_x0_25_msmt17.pt", + suffix: str | tuple[str, ...] = (".pt",), + msg: str = "", + ): + suffix = [suffix] if isinstance(suffix, str) else list(suffix) + files = [file] if isinstance(file, (str, Path)) else list(file) + for f in files: + file_suffix = Path(f).suffix.lower() + if file_suffix and file_suffix not in suffix: + print( + f"File {f} does not have an acceptable suffix. Expected: {suffix}" + ) - def model_type(self, p: Path) -> Tuple[bool, ...]: - # For demo, just check for .pt - sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] - self.check_suffix(p, sf) - types = [str(Path(p)).endswith(s) for s in sf] - # OpenVINO explicit check - if Path(p).suffix in ['.xml', '.bin']: - types[3] = True - return tuple(types) + def model_type(self, p: Path) -> tuple[bool, ...]: + # For demo, just check for .pt + sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] + self.check_suffix(p, sf) + types = [str(Path(p)).endswith(s) for s in sf] + # OpenVINO explicit check + if Path(p).suffix in [".xml", ".bin"]: + types[3] = True + return tuple(types) diff --git a/ethology/reid/core/config.py b/ethology/reid/core/config.py index 926c0cc9..dc17cc14 100644 --- a/ethology/reid/core/config.py +++ b/ethology/reid/core/config.py @@ -1,16 +1,16 @@ MODEL_TYPES = [ - "resnet50", - "resnet101", - "mlfn", - "hacnn", - "mobilenetv2_x1_0", - "mobilenetv2_x1_4", - "osnet_x1_0", - "osnet_x0_75", - "osnet_x0_5", - "osnet_x0_25", - "osnet_ibn_x1_0", - "osnet_ain_x1_0", - "lmbn_n", - "clip", + "resnet50", + "resnet101", + "mlfn", + "hacnn", + "mobilenetv2_x1_0", + "mobilenetv2_x1_4", + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", + "osnet_ain_x1_0", + "lmbn_n", + "clip", ] diff --git a/ethology/reid/core/factory.py b/ethology/reid/core/factory.py index bc8b6ab1..27406383 100644 --- a/ethology/reid/core/factory.py +++ b/ethology/reid/core/factory.py @@ -1,30 +1,44 @@ - # Import model constructors from ethology's local backbones from ethology.reid.backbones.hacnn import HACNN from ethology.reid.backbones.mlfn import mlfn -from ethology.reid.backbones.mobilenetv2 import mobilenetv2_x1_0, mobilenetv2_x1_4 -from ethology.reid.backbones.osnet import osnet_ibn_x1_0, osnet_x0_5, osnet_x0_25, osnet_x0_75, osnet_x1_0 -from ethology.reid.backbones.osnet_ain import osnet_ain_x0_5, osnet_ain_x0_25, osnet_ain_x0_75, osnet_ain_x1_0 +from ethology.reid.backbones.mobilenetv2 import ( + mobilenetv2_x1_0, + mobilenetv2_x1_4, +) +from ethology.reid.backbones.osnet import ( + osnet_ibn_x1_0, + osnet_x0_5, + osnet_x0_25, + osnet_x0_75, + osnet_x1_0, +) +from ethology.reid.backbones.osnet_ain import ( + osnet_ain_x0_5, + osnet_ain_x0_25, + osnet_ain_x0_75, + osnet_ain_x1_0, +) from ethology.reid.backbones.resnet import resnet50, resnet101 + # from ethology.reid.backbones.lmbn.lmbn_n import LMBN_n # If present # from ethology.reid.backbones.clip.make_model import make_model # If present MODEL_FACTORY = { - "resnet50": resnet50, - "resnet101": resnet101, - "mobilenetv2_x1_0": mobilenetv2_x1_0, - "mobilenetv2_x1_4": mobilenetv2_x1_4, - "hacnn": HACNN, - "mlfn": mlfn, - "osnet_x1_0": osnet_x1_0, - "osnet_x0_75": osnet_x0_75, - "osnet_x0_5": osnet_x0_5, - "osnet_x0_25": osnet_x0_25, - "osnet_ibn_x1_0": osnet_ibn_x1_0, - "osnet_ain_x1_0": osnet_ain_x1_0, - "osnet_ain_x0_75": osnet_ain_x0_75, - "osnet_ain_x0_5": osnet_ain_x0_5, - "osnet_ain_x0_25": osnet_ain_x0_25, - # "lmbn_n": LMBN_n, # Uncomment if implemented - # "clip": make_model, # Uncomment if implemented + "resnet50": resnet50, + "resnet101": resnet101, + "mobilenetv2_x1_0": mobilenetv2_x1_0, + "mobilenetv2_x1_4": mobilenetv2_x1_4, + "hacnn": HACNN, + "mlfn": mlfn, + "osnet_x1_0": osnet_x1_0, + "osnet_x0_75": osnet_x0_75, + "osnet_x0_5": osnet_x0_5, + "osnet_x0_25": osnet_x0_25, + "osnet_ibn_x1_0": osnet_ibn_x1_0, + "osnet_ain_x1_0": osnet_ain_x1_0, + "osnet_ain_x0_75": osnet_ain_x0_75, + "osnet_ain_x0_5": osnet_ain_x0_5, + "osnet_ain_x0_25": osnet_ain_x0_25, + # "lmbn_n": LMBN_n, # Uncomment if implemented + # "clip": make_model, # Uncomment if implemented } diff --git a/ethology/reid/core/handler.py b/ethology/reid/core/handler.py index b5e51391..ba521ab2 100644 --- a/ethology/reid/core/handler.py +++ b/ethology/reid/core/handler.py @@ -2,32 +2,35 @@ # Thin wrapper to use BoxMOT ReID models in ethology from pathlib import Path -from typing import Union -import numpy as np +import numpy as np # Import ethology's local ReID handler from ethology.reid.core.reid_handler import ReID as EthologyReID + class ReIDHandler: - """ - Ethology ReID handler using local models and backends. - """ - def __init__(self, weights: Union[str, Path], device='cpu', half=False): + """Ethology ReID handler using local models and backends.""" + + def __init__(self, weights: str | Path, device="cpu", half=False): self.model = EthologyReID(weights=weights, device=device, half=half) - def extract_features(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: - """ - Extract feature embeddings for detections in a frame. + def extract_features( + self, frame: np.ndarray, dets: np.ndarray + ) -> np.ndarray: + """Extract feature embeddings for detections in a frame. + Parameters ---------- frame : np.ndarray (H, W, C) BGR image. dets : np.ndarray (N, 6) array of detections (x1, y1, x2, y2, conf, cls). + Returns ------- np.ndarray (N, D) feature embeddings. + """ return self.model(frame, dets) diff --git a/ethology/reid/core/registry.py b/ethology/reid/core/registry.py index 333cff2f..4b9c27fd 100644 --- a/ethology/reid/core/registry.py +++ b/ethology/reid/core/registry.py @@ -1,71 +1,88 @@ - from collections import OrderedDict + import torch -from ethology.reid.core.config import MODEL_TYPES #, NR_CLASSES_DICT, TRAINED_URLS + +from ethology.reid.core.config import ( + MODEL_TYPES, # , NR_CLASSES_DICT, TRAINED_URLS +) from ethology.reid.core.factory import MODEL_FACTORY + # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER + class ReIDModelRegistry: - """Encapsulates model registration and related utilities.""" + """Encapsulates model registration and related utilities.""" - @staticmethod - def show_downloadable_models(): - # LOGGER.info("Available .pt ReID models for automatic download") - # LOGGER.info(list(TRAINED_URLS.keys())) - pass + @staticmethod + def show_downloadable_models(): + # LOGGER.info("Available .pt ReID models for automatic download") + # LOGGER.info(list(TRAINED_URLS.keys())) + pass - @staticmethod - def get_model_name(model): - for name in MODEL_TYPES: - if name in model.name: - return name - return None + @staticmethod + def get_model_name(model): + for name in MODEL_TYPES: + if name in model.name: + return name + return None - @staticmethod - def get_model_url(model): - # return TRAINED_URLS.get(model.name, None) - return None + @staticmethod + def get_model_url(model): + # return TRAINED_URLS.get(model.name, None) + return None - @staticmethod - def load_pretrained_weights(model, weight_path): - device = "cpu" if not torch.cuda.is_available() else None - checkpoint = torch.load( - weight_path, - map_location=torch.device("cpu") if device == "cpu" else None, - weights_only=False, - encoding='latin1', - ) - state_dict = checkpoint.get("state_dict", checkpoint) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - key = k[7:] if k.startswith("module.") else k - if key in model_dict and model_dict[key].size() == v.size(): - new_state_dict[key] = v - matched_layers.append(key) - else: - discarded_layers.append(key) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) + @staticmethod + def load_pretrained_weights(model, weight_path): + device = "cpu" if not torch.cuda.is_available() else None + checkpoint = torch.load( + weight_path, + map_location=torch.device("cpu") if device == "cpu" else None, + weights_only=False, + encoding="latin1", + ) + state_dict = checkpoint.get("state_dict", checkpoint) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + key = k[7:] if k.startswith("module.") else k + if key in model_dict and model_dict[key].size() == v.size(): + new_state_dict[key] = v + matched_layers.append(key) + else: + discarded_layers.append(key) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) - @staticmethod - def show_available_models(): - # LOGGER.info("Available models:") - # LOGGER.info(list(MODEL_FACTORY.keys())) - pass + @staticmethod + def show_available_models(): + # LOGGER.info("Available models:") + # LOGGER.info(list(MODEL_FACTORY.keys())) + pass - @staticmethod - def get_nr_classes(weights): - # dataset_key = weights.name.split("_")[1] - # return NR_CLASSES_DICT.get(dataset_key, 1) - return 1 + @staticmethod + def get_nr_classes(weights): + # dataset_key = weights.name.split("_")[1] + # return NR_CLASSES_DICT.get(dataset_key, 1) + return 1 - @staticmethod - def build_model(name, weights, num_classes, loss="softmax", pretrained=True, use_gpu=True): - if name not in MODEL_FACTORY: - available = list(MODEL_FACTORY.keys()) - raise KeyError(f"Unknown model '{name}'. Must be one of {available}") - return MODEL_FACTORY[name]( - num_classes=num_classes, loss=loss, pretrained=pretrained, use_gpu=use_gpu - ) + @staticmethod + def build_model( + name, + weights, + num_classes, + loss="softmax", + pretrained=True, + use_gpu=True, + ): + if name not in MODEL_FACTORY: + available = list(MODEL_FACTORY.keys()) + raise KeyError( + f"Unknown model '{name}'. Must be one of {available}" + ) + return MODEL_FACTORY[name]( + num_classes=num_classes, + loss=loss, + pretrained=pretrained, + use_gpu=use_gpu, + ) diff --git a/ethology/reid/core/reid_handler.py b/ethology/reid/core/reid_handler.py index 2c72658a..62d42209 100644 --- a/ethology/reid/core/reid_handler.py +++ b/ethology/reid/core/reid_handler.py @@ -1,28 +1,33 @@ - from pathlib import Path -from typing import Union + import numpy as np + from ethology.reid.core.auto_backend import ReidAutoBackend + class ReID: - def __init__(self, weights: Union[str, Path], device='cpu', half=False): - self.weights = Path(weights) - self.device = device - self.half = half - self.backend = ReidAutoBackend(weights=self.weights, device=device, half=half) - self.model = self.backend.model - - def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: - """ - Extract features for detections in a frame. - Args: - frame: (H, W, C) BGR image - dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. - Returns: - embs: (N, D) embeddings. - """ - if dets.shape[0] == 0: - return np.empty((0, 0)) - xyxy = dets[:, :4] - embs = self.model.get_features(xyxy, frame) - return embs + def __init__(self, weights: str | Path, device="cpu", half=False): + self.weights = Path(weights) + self.device = device + self.half = half + self.backend = ReidAutoBackend( + weights=self.weights, device=device, half=half + ) + self.model = self.backend.model + + def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """Extract features for detections in a frame. + + Args: + frame: (H, W, C) BGR image + dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. + + Returns: + embs: (N, D) embeddings. + + """ + if dets.shape[0] == 0: + return np.empty((0, 0)) + xyxy = dets[:, :4] + embs = self.model.get_features(xyxy, frame) + return embs diff --git a/tests/test_unit/test_reid_handler.py b/tests/test_unit/test_reid_handler.py index 3a5146cf..bc8a199c 100644 --- a/tests/test_unit/test_reid_handler.py +++ b/tests/test_unit/test_reid_handler.py @@ -1,12 +1,16 @@ import numpy as np + from ethology.reid.core.handler import ReIDHandler + def test_extract_features_shape(): - handler = ReIDHandler(weights='osnet_x0_25_imagenet.pth') + handler = ReIDHandler(weights="osnet_x0_25_imagenet.pth") frame = np.random.randint(0, 255, (128, 64, 3), dtype=np.uint8) - dets = np.array([ - [10, 10, 50, 100, 0.9, 1], - [60, 20, 100, 110, 0.8, 2], - ]) + dets = np.array( + [ + [10, 10, 50, 100, 0.9, 1], + [60, 20, 100, 110, 0.8, 2], + ] + ) feats = handler.extract_features(frame, dets) assert feats.shape[0] == dets.shape[0] From 045b9df95bf8a5987e188d2acddeefd32ccfd1a4 Mon Sep 17 00:00:00 2001 From: AnandMayank Date: Wed, 18 Feb 2026 19:21:29 +0530 Subject: [PATCH 07/12] style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues --- ethology/reid/backbones/hacnn.py | 578 ++++++------ ethology/reid/backbones/mlfn.py | 8 +- ethology/reid/backbones/mobilenetv2.py | 456 +++++----- ethology/reid/backbones/osnet.py | 800 +++++++---------- ethology/reid/backbones/osnet_ain.py | 831 +++++++----------- ethology/reid/backbones/resnet.py | 271 ++++++ ethology/reid/backends/base_backend.py | 48 +- ethology/reid/backends/onnx_backend.py | 55 +- ethology/reid/backends/openvino_backend.py | 83 +- ethology/reid/backends/pytorch_backend.py | 28 +- ethology/reid/backends/tensorrt_backend.py | 688 +++++++-------- ethology/reid/backends/tflite_backend.py | 64 +- ethology/reid/backends/torchscript_backend.py | 25 +- ethology/reid/core/auto_backend.py | 127 ++- ethology/reid/core/config.py | 28 +- ethology/reid/core/factory.py | 56 +- ethology/reid/core/handler.py | 21 +- ethology/reid/core/registry.py | 133 ++- ethology/reid/core/reid_handler.py | 51 +- tests/test_unit/test_reid_handler.py | 14 +- 20 files changed, 2024 insertions(+), 2341 deletions(-) diff --git a/ethology/reid/backbones/hacnn.py b/ethology/reid/backbones/hacnn.py index 9394ad30..f3a65746 100644 --- a/ethology/reid/backbones/hacnn.py +++ b/ethology/reid/backbones/hacnn.py @@ -1,5 +1,6 @@ -# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +"""HACNN backbone for person re-identification.""" +from __future__ import absolute_import, division import torch from torch import nn @@ -9,315 +10,298 @@ class ConvBlock(nn.Module): - def __init__(self, in_c, out_c, k, s=1, p=0): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) - self.bn = nn.BatchNorm2d(out_c) - - def forward(self, x): - return F.relu(self.bn(self.conv(x))) - + def __init__(self, in_c, out_c, k, s=1, p=0): + """Convolutional block with batch norm and ReLU.""" + super().__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) + self.bn = nn.BatchNorm2d(out_c) + def forward(self, x): + return F.relu(self.bn(self.conv(x))) class InceptionA(nn.Module): - def __init__(self, in_channels, out_channels): - super(InceptionA, self).__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream3 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream4 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1), - ConvBlock(in_channels, mid_channels, 1), - ) - - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - s4 = self.stream4(x) - y = torch.cat([s1, s2, s3, s4], dim=1) - return y - + def __init__(self, in_channels, out_channels): + """InceptionA block.""" + super().__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream3 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream4 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1), + ConvBlock(in_channels, mid_channels, 1), + ) + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + s4 = self.stream4(x) + y = torch.cat([s1, s2, s3, s4], dim=1) + return y class InceptionB(nn.Module): - def __init__(self, in_channels, out_channels): - super(InceptionB, self).__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream3 = nn.Sequential( - nn.MaxPool2d(3, stride=2, padding=1), - ConvBlock(in_channels, mid_channels * 2, 1), - ) - - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - y = torch.cat([s1, s2, s3], dim=1) - return y - + def __init__(self, in_channels, out_channels): + """InceptionB block.""" + super().__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream3 = nn.Sequential( + nn.MaxPool2d(3, stride=2, padding=1), + ConvBlock(in_channels, mid_channels * 2, 1), + ) + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + y = torch.cat([s1, s2, s3], dim=1) + return y class SpatialAttn(nn.Module): - def __init__(self): - super(SpatialAttn, self).__init__() - self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) - self.conv2 = ConvBlock(1, 1, 1) - - def forward(self, x): - x = x.mean(1, keepdim=True) - x = self.conv1(x) - x = F.interpolate( - x, - (x.size(2) * 2, x.size(3) * 2), - mode="bilinear", - align_corners=True, - ) - x = self.conv2(x) - return x - + def __init__(self): + """Spatial attention block.""" + super().__init__() + self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) + self.conv2 = ConvBlock(1, 1, 1) + def forward(self, x): + x = x.mean(1, keepdim=True) + x = self.conv1(x) + x = F.interpolate( + x, (x.size(2) * 2, x.size(3) * 2), mode="bilinear", align_corners=True + ) + x = self.conv2(x) + return x class ChannelAttn(nn.Module): - def __init__(self, in_channels, reduction_rate=16): - super(ChannelAttn, self).__init__() - assert in_channels % reduction_rate == 0 - self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) - self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) - - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]) - x = self.conv1(x) - x = self.conv2(x) - return x - + def __init__(self, in_channels, reduction_rate=16): + """Channel attention block.""" + super().__init__() + assert in_channels % reduction_rate == 0 + self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) + self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]) + x = self.conv1(x) + x = self.conv2(x) + return x class SoftAttn(nn.Module): - def __init__(self, in_channels): - super(SoftAttn, self).__init__() - self.spatial_attn = SpatialAttn() - self.channel_attn = ChannelAttn(in_channels) - self.conv = ConvBlock(in_channels, in_channels, 1) - - def forward(self, x): - y_spatial = self.spatial_attn(x) - y_channel = self.channel_attn(x) - y = y_spatial * y_channel - y = torch.sigmoid(self.conv(y)) - return y - + def __init__(self, in_channels): + """Soft attention block.""" + super().__init__() + self.spatial_attn = SpatialAttn() + self.channel_attn = ChannelAttn(in_channels) + self.conv = ConvBlock(in_channels, in_channels, 1) + def forward(self, x): + y_spatial = self.spatial_attn(x) + y_channel = self.channel_attn(x) + y = y_spatial * y_channel + y = torch.sigmoid(self.conv(y)) + return y class HardAttn(nn.Module): - def __init__(self, in_channels): - super(HardAttn, self).__init__() - self.fc = nn.Linear(in_channels, 4 * 2) - self.init_params() - - def init_params(self): - self.fc.weight.data.zero_() - self.fc.bias.data.copy_( - torch.tensor( - [0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float - ) - ) - - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) - theta = torch.tanh(self.fc(x)) - theta = theta.view(-1, 4, 2) - return theta - + def __init__(self, in_channels): + """Hard attention block.""" + super().__init__() + self.fc = nn.Linear(in_channels, 4 * 2) + self.init_params() + def init_params(self): + self.fc.weight.data.zero_() + self.fc.bias.data.copy_( + torch.tensor([0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float) + ) + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) + theta = torch.tanh(self.fc(x)) + theta = theta.view(-1, 4, 2) + return theta class HarmAttn(nn.Module): - def __init__(self, in_channels): - super(HarmAttn, self).__init__() - self.soft_attn = SoftAttn(in_channels) - self.hard_attn = HardAttn(in_channels) - - def forward(self, x): - y_soft_attn = self.soft_attn(x) - theta = self.hard_attn(x) - return y_soft_attn, theta - + def __init__(self, in_channels): + """Harmonious attention block.""" + super().__init__() + self.soft_attn = SoftAttn(in_channels) + self.hard_attn = HardAttn(in_channels) + def forward(self, x): + y_soft_attn = self.soft_attn(x) + theta = self.hard_attn(x) + return y_soft_attn, theta class HACNN(nn.Module): - def __init__( - self, - num_classes, - loss="softmax", - nchannels=[128, 256, 384], - feat_dim=512, - learn_region=True, - use_gpu=True, - **kwargs, - ): - super(HACNN, self).__init__() - self.loss = loss - self.learn_region = learn_region - self.use_gpu = use_gpu - self.conv = ConvBlock(3, 32, 3, s=2, p=1) - self.inception1 = nn.Sequential( - InceptionA(32, nchannels[0]), - InceptionB(nchannels[0], nchannels[0]), - ) - self.ha1 = HarmAttn(nchannels[0]) - self.inception2 = nn.Sequential( - InceptionA(nchannels[0], nchannels[1]), - InceptionB(nchannels[1], nchannels[1]), - ) - self.ha2 = HarmAttn(nchannels[1]) - self.inception3 = nn.Sequential( - InceptionA(nchannels[1], nchannels[2]), - InceptionB(nchannels[2], nchannels[2]), - ) - self.ha3 = HarmAttn(nchannels[2]) - self.fc_global = nn.Sequential( - nn.Linear(nchannels[2], feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_global = nn.Linear(feat_dim, num_classes) - if self.learn_region: - self.init_scale_factors() - self.local_conv1 = InceptionB(32, nchannels[0]) - self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) - self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) - self.fc_local = nn.Sequential( - nn.Linear(nchannels[2] * 4, feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_local = nn.Linear(feat_dim, num_classes) - self.feat_dim = feat_dim * 2 - else: - self.feat_dim = feat_dim - - def init_scale_factors(self): - self.scale_factors = [] - self.scale_factors.append( - torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) - ) - self.scale_factors.append( - torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) - ) - self.scale_factors.append( - torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) - ) - self.scale_factors.append( - torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) - ) - - def stn(self, x, theta): - grid = F.affine_grid(theta, x.size()) - x = F.grid_sample(x, grid) - return x - - def transform_theta(self, theta_i, region_idx): - scale_factors = self.scale_factors[region_idx] - theta = torch.zeros(theta_i.size(0), 2, 3) - theta[:, :, :2] = scale_factors - theta[:, :, -1] = theta_i - if self.use_gpu: - theta = theta.to(next(self.parameters()).device) - return theta - - def forward(self, x): - assert x.size(2) == 160 and x.size(3) == 64, ( - f"Input size does not match, expected (160, 64) but got ({x.size(2)}, {x.size(3)})" - ) - x = self.conv(x) - x1 = self.inception1(x) - x1_attn, x1_theta = self.ha1(x1) - x1_out = x1 * x1_attn - if self.learn_region: - x1_local_list = [] - for region_idx in range(4): - x1_theta_i = x1_theta[:, region_idx, :] - x1_theta_i = self.transform_theta(x1_theta_i, region_idx) - x1_trans_i = self.stn(x, x1_theta_i) - x1_trans_i = F.interpolate( - x1_trans_i, (24, 28), mode="bilinear", align_corners=True - ) - x1_local_i = self.local_conv1(x1_trans_i) - x1_local_list.append(x1_local_i) - x2 = self.inception2(x1_out) - x2_attn, x2_theta = self.ha2(x2) - x2_out = x2 * x2_attn - if self.learn_region: - x2_local_list = [] - for region_idx in range(4): - x2_theta_i = x2_theta[:, region_idx, :] - x2_theta_i = self.transform_theta(x2_theta_i, region_idx) - x2_trans_i = self.stn(x1_out, x2_theta_i) - x2_trans_i = F.interpolate( - x2_trans_i, (12, 14), mode="bilinear", align_corners=True - ) - x2_local_i = x2_trans_i + x1_local_list[region_idx] - x2_local_i = self.local_conv2(x2_local_i) - x2_local_list.append(x2_local_i) - x3 = self.inception3(x2_out) - x3_attn, x3_theta = self.ha3(x3) - x3_out = x3 * x3_attn - if self.learn_region: - x3_local_list = [] - for region_idx in range(4): - x3_theta_i = x3_theta[:, region_idx, :] - x3_theta_i = self.transform_theta(x3_theta_i, region_idx) - x3_trans_i = self.stn(x2_out, x3_theta_i) - x3_trans_i = F.interpolate( - x3_trans_i, (6, 7), mode="bilinear", align_corners=True - ) - x3_local_i = x3_trans_i + x2_local_list[region_idx] - x3_local_i = self.local_conv3(x3_local_i) - x3_local_list.append(x3_local_i) - x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( - x3_out.size(0), x3_out.size(1) - ) - x_global = self.fc_global(x_global) - if self.learn_region: - x_local_list = [] - for region_idx in range(4): - x_local_i = x3_local_list[region_idx] - x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( - x_local_i.size(0), -1 - ) - x_local_list.append(x_local_i) - x_local = torch.cat(x_local_list, 1) - x_local = self.fc_local(x_local) - if not self.training: - if self.learn_region: - x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) - x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) - return torch.cat([x_global, x_local], 1) - else: - return x_global - prelogits_global = self.classifier_global(x_global) - if self.learn_region: - prelogits_local = self.classifier_local(x_local) - if self.loss == "softmax": - if self.learn_region: - return (prelogits_global, prelogits_local) - else: - return prelogits_global - elif self.loss == "triplet": - if self.learn_region: - return (prelogits_global, prelogits_local), (x_global, x_local) - else: - return prelogits_global, x_global - else: - raise KeyError(f"Unsupported loss: {self.loss}") + def __init__( + self, + num_classes, + loss="softmax", + nchannels=None, + feat_dim=512, + learn_region=True, + use_gpu=True, + **kwargs, + ): + """Harmonious Attention Convolutional Neural Network (HACNN) for person re-identification.""" + super().__init__() + if nchannels is None: + nchannels = [128, 256, 384] + self.loss = loss + self.learn_region = learn_region + self.use_gpu = use_gpu + self.conv = ConvBlock(3, 32, 3, s=2, p=1) + self.inception1 = nn.Sequential( + InceptionA(32, nchannels[0]), + InceptionB(nchannels[0], nchannels[0]), + ) + self.ha1 = HarmAttn(nchannels[0]) + self.inception2 = nn.Sequential( + InceptionA(nchannels[0], nchannels[1]), + InceptionB(nchannels[1], nchannels[1]), + ) + self.ha2 = HarmAttn(nchannels[1]) + self.inception3 = nn.Sequential( + InceptionA(nchannels[1], nchannels[2]), + InceptionB(nchannels[2], nchannels[2]), + ) + self.ha3 = HarmAttn(nchannels[2]) + self.fc_global = nn.Sequential( + nn.Linear(nchannels[2], feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_global = nn.Linear(feat_dim, num_classes) + if self.learn_region: + self.init_scale_factors() + self.local_conv1 = InceptionB(32, nchannels[0]) + self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) + self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) + self.fc_local = nn.Sequential( + nn.Linear(nchannels[2] * 4, feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_local = nn.Linear(feat_dim, num_classes) + self.feat_dim = feat_dim * 2 + else: + self.feat_dim = feat_dim + def init_scale_factors(self): + """Initialize scale factors for STN.""" + self.scale_factors = [] + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) + def stn(self, x, theta): + """Spatial transformer network.""" + grid = F.affine_grid(theta, x.size()) + x = F.grid_sample(x, grid) + return x + def transform_theta(self, theta_i, region_idx): + """Transform theta for a given region.""" + scale_factors = self.scale_factors[region_idx] + theta = torch.zeros(theta_i.size(0), 2, 3) + theta[:, :, :2] = scale_factors + theta[:, :, -1] = theta_i + if self.use_gpu: + theta = theta.to(next(self.parameters()).device) + return theta + def forward(self, x): + """Forward pass.""" + assert ( + x.size(2) == 160 and x.size(3) == 64 + ), ( + f"Input size does not match, expected (160, 64) but got ({x.size(2)}, {x.size(3)})" + ) + x = self.conv(x) + x1 = self.inception1(x) + x1_attn, x1_theta = self.ha1(x1) + x1_out = x1 * x1_attn + if self.learn_region: + x1_local_list = [] + for region_idx in range(4): + x1_theta_i = x1_theta[:, region_idx, :] + x1_theta_i = self.transform_theta(x1_theta_i, region_idx) + x1_trans_i = self.stn(x, x1_theta_i) + x1_trans_i = F.interpolate( + x1_trans_i, (24, 28), mode="bilinear", align_corners=True + ) + x1_local_i = self.local_conv1(x1_trans_i) + x1_local_list.append(x1_local_i) + x2 = self.inception2(x1_out) + x2_attn, x2_theta = self.ha2(x2) + x2_out = x2 * x2_attn + if self.learn_region: + x2_local_list = [] + for region_idx in range(4): + x2_theta_i = x2_theta[:, region_idx, :] + x2_theta_i = self.transform_theta(x2_theta_i, region_idx) + x2_trans_i = self.stn(x1_out, x2_theta_i) + x2_trans_i = F.interpolate( + x2_trans_i, (12, 14), mode="bilinear", align_corners=True + ) + x2_local_i = x2_trans_i + x1_local_list[region_idx] + x2_local_i = self.local_conv2(x2_local_i) + x2_local_list.append(x2_local_i) + x3 = self.inception3(x2_out) + x3_attn, x3_theta = self.ha3(x3) + x3_out = x3 * x3_attn + if self.learn_region: + x3_local_list = [] + for region_idx in range(4): + x3_theta_i = x3_theta[:, region_idx, :] + x3_theta_i = self.transform_theta(x3_theta_i, region_idx) + x3_trans_i = self.stn(x2_out, x3_theta_i) + x3_trans_i = F.interpolate( + x3_trans_i, (6, 7), mode="bilinear", align_corners=True + ) + x3_local_i = x3_trans_i + x2_local_list[region_idx] + x3_local_i = self.local_conv3(x3_local_i) + x3_local_list.append(x3_local_i) + x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( + x3_out.size(0), x3_out.size(1) + ) + x_global = self.fc_global(x_global) + if self.learn_region: + x_local_list = [] + for region_idx in range(4): + x_local_i = x3_local_list[region_idx] + x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( + x_local_i.size(0), -1 + ) + x_local_list.append(x_local_i) + x_local = torch.cat(x_local_list, 1) + x_local = self.fc_local(x_local) + if not self.training: + if self.learn_region: + x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) + x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) + return torch.cat([x_global, x_local], 1) + else: + return x_global + prelogits_global = self.classifier_global(x_global) + if self.learn_region: + prelogits_local = self.classifier_local(x_local) + if self.loss == "softmax": + if self.learn_region: + return (prelogits_global, prelogits_local) + else: + return prelogits_global + elif self.loss == "triplet": + if self.learn_region: + return (prelogits_global, prelogits_local), (x_global, x_local) + else: + return prelogits_global, x_global + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index 8daad863..334bd1c8 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -1,5 +1,7 @@ -# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +"""MLFN backbone for person re-identification.""" + +from __future__ import absolute_import, division import torch import torch.utils.model_zoo as model_zoo from torch import nn @@ -7,8 +9,8 @@ __all__ = ["mlfn"] model_urls = { - # training epoch = 5, top1 = 51.6 - "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", + # training epoch = 5, top1 = 51.6 + "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", } diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py index b3e69186..35a16219 100644 --- a/ethology/reid/backbones/mobilenetv2.py +++ b/ethology/reid/backbones/mobilenetv2.py @@ -1,5 +1,6 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +from __future__ import absolute_import, division import torch.utils.model_zoo as model_zoo from torch import nn @@ -8,272 +9,239 @@ __all__ = ["mobilenetv2_x1_0", "mobilenetv2_x1_4"] model_urls = { - # 1.0: top-1 71.3 - "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", - # 1.4: top-1 73.9 - "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", + # 1.0: top-1 71.3 + "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", + # 1.4: top-1 73.9 + "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", } class ConvBlock(nn.Module): - """Basic convolutional block. + """Basic convolutional block. - convolution (bias discarded) + batch normalization + relu6. + convolution (bias discarded) + batch normalization + relu6. - Args: - in_c (int): number of input channels. - out_c (int): number of output channels. - k (int or tuple): kernel size. - s (int or tuple): stride. - p (int or tuple): padding. - g (int): number of blocked connections from input channels - to output channels (default: 1). + Args: + in_c (int): number of input channels. + out_c (int): number of output channels. + k (int or tuple): kernel size. + s (int or tuple): stride. + p (int or tuple): padding. + g (int): number of blocked connections from input channels + to output channels (default: 1). + """ - """ + def __init__(self, in_c, out_c, k, s=1, p=0, g=1): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) + self.bn = nn.BatchNorm2d(out_c) - def __init__(self, in_c, out_c, k, s=1, p=0, g=1): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d( - in_c, out_c, k, stride=s, padding=p, bias=False, groups=g - ) - self.bn = nn.BatchNorm2d(out_c) - - def forward(self, x): - return F.relu6(self.bn(self.conv(x))) + def forward(self, x): + return F.relu6(self.bn(self.conv(x))) class Bottleneck(nn.Module): - def __init__(self, in_channels, out_channels, expansion_factor, stride=1): - super(Bottleneck, self).__init__() - mid_channels = in_channels * expansion_factor - self.use_residual = stride == 1 and in_channels == out_channels - self.conv1 = ConvBlock(in_channels, mid_channels, 1) - self.dwconv2 = ConvBlock( - mid_channels, mid_channels, 3, stride, 1, g=mid_channels - ) - self.conv3 = nn.Sequential( - nn.Conv2d(mid_channels, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - m = self.conv1(x) - m = self.dwconv2(m) - m = self.conv3(m) - if self.use_residual: - return x + m - else: - return m + def __init__(self, in_channels, out_channels, expansion_factor, stride=1): + super(Bottleneck, self).__init__() + mid_channels = in_channels * expansion_factor + self.use_residual = stride == 1 and in_channels == out_channels + self.conv1 = ConvBlock(in_channels, mid_channels, 1) + self.dwconv2 = ConvBlock( + mid_channels, mid_channels, 3, stride, 1, g=mid_channels + ) + self.conv3 = nn.Sequential( + nn.Conv2d(mid_channels, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + m = self.conv1(x) + m = self.dwconv2(m) + m = self.conv3(m) + if self.use_residual: + return x + m + else: + return m class MobileNetV2(nn.Module): - """MobileNetV2. - - Reference: - Sandler et al. MobileNetV2: Inverted Residuals and - Linear Bottlenecks. CVPR 2018. - - Public keys: - - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. - - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. - """ - - def __init__( - self, - num_classes, - width_mult=1, - loss="softmax", - fc_dims=None, - dropout_p=None, - **kwargs, - ): - super(MobileNetV2, self).__init__() - self.loss = loss - self.in_channels = int(32 * width_mult) - self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 - - # construct layers - self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) - self.conv2 = self._make_layer( - Bottleneck, 1, int(16 * width_mult), 1, 1 - ) - self.conv3 = self._make_layer( - Bottleneck, 6, int(24 * width_mult), 2, 2 - ) - self.conv4 = self._make_layer( - Bottleneck, 6, int(32 * width_mult), 3, 2 - ) - self.conv5 = self._make_layer( - Bottleneck, 6, int(64 * width_mult), 4, 2 - ) - self.conv6 = self._make_layer( - Bottleneck, 6, int(96 * width_mult), 3, 1 - ) - self.conv7 = self._make_layer( - Bottleneck, 6, int(160 * width_mult), 3, 2 - ) - self.conv8 = self._make_layer( - Bottleneck, 6, int(320 * width_mult), 1, 1 - ) - self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) - - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer( - fc_dims, self.feature_dim, dropout_p - ) - self.classifier = nn.Linear(self.feature_dim, num_classes) - - self._init_params() - - def _make_layer(self, block, t, c, n, s): - # t: expansion factor - # c: output channels - # n: number of blocks - # s: stride for first layer - layers = [] - layers.append(block(self.in_channels, c, t, s)) - self.in_channels = c - for i in range(1, n): - layers.append(block(self.in_channels, c, t)) - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - """Constructs fully connected layer. - - Args: - fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed - input_dim (int): input dimension - dropout_p (float): dropout probability, if None, dropout is unused - - """ - if fc_dims is None: - self.feature_dim = input_dim - return None - - assert isinstance(fc_dims, (list, tuple)), ( - f"fc_dims must be either list or tuple, but got {type(fc_dims)}" - ) - - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - - self.feature_dim = fc_dims[-1] - - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode="fan_out", nonlinearity="relu" - ) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d) or isinstance( - m, nn.BatchNorm1d - ): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - x = self.conv6(x) - x = self.conv7(x) - x = self.conv8(x) - x = self.conv9(x) - return x - - def forward(self, x): - f = self.featuremaps(x) - v = self.global_avgpool(f) - v = v.view(v.size(0), -1) - - if self.fc is not None: - v = self.fc(v) - - if not self.training: - return v - - y = self.classifier(v) - - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError(f"Unsupported loss: {self.loss}") + """MobileNetV2. + + Reference: + Sandler et al. MobileNetV2: Inverted Residuals and + Linear Bottlenecks. CVPR 2018. + + Public keys: + - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. + - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. + """ + + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super(MobileNetV2, self).__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + + # construct layers + self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) + self.conv2 = self._make_layer(Bottleneck, 1, int(16 * width_mult), 1, 1) + self.conv3 = self._make_layer(Bottleneck, 6, int(24 * width_mult), 2, 2) + self.conv4 = self._make_layer(Bottleneck, 6, int(32 * width_mult), 3, 2) + self.conv5 = self._make_layer(Bottleneck, 6, int(64 * width_mult), 4, 2) + self.conv6 = self._make_layer(Bottleneck, 6, int(96 * width_mult), 3, 1) + self.conv7 = self._make_layer(Bottleneck, 6, int(160 * width_mult), 3, 2) + self.conv8 = self._make_layer(Bottleneck, 6, int(320 * width_mult), 1, 1) + self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) + + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(fc_dims, self.feature_dim, dropout_p) + self.classifier = nn.Linear(self.feature_dim, num_classes) + + self._init_params() + + def _make_layer(self, block, t, c, n, s): + # t: expansion factor + # c: output channels + # n: number of blocks + # s: stride for first layer + layers = [] + layers.append(block(self.in_channels, c, t, s)) + self.in_channels = c + for i in range(1, n): + layers.append(block(self.in_channels, c, t)) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + """Constructs fully connected layer. + + Args: + fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed + input_dim (int): input dimension + dropout_p (float): dropout probability, if None, dropout is unused + """ + if fc_dims is None: + self.feature_dim = input_dim + return None + + assert isinstance( + fc_dims, (list, tuple) + ), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) + + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + + self.feature_dim = fc_dims[-1] + + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = self.conv6(x) + x = self.conv7(x) + x = self.conv8(x) + x = self.conv9(x) + return x + + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + + if self.fc is not None: + v = self.fc(v) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) def init_pretrained_weights(model, model_url): - """Initializes model with pretrained weights. + """Initializes model with pretrained weights. - Layers that don't match with pretrained layers in name or size are kept unchanged. - """ - pretrain_dict = model_zoo.load_url(model_url) - model_dict = model.state_dict() - pretrain_dict = { - k: v - for k, v in pretrain_dict.items() - if k in model_dict and model_dict[k].size() == v.size() - } - model_dict.update(pretrain_dict) - model.load_state_dict(model_dict) + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) def mobilenetv2_x1_0(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, - loss=loss, - width_mult=1, - fc_dims=None, - dropout_p=None, - **kwargs, - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_0"] - ) - ) - return model - + model = MobileNetV2( + num_classes, loss=loss, width_mult=1, fc_dims=None, dropout_p=None, **kwargs + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) + import warnings -def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, - loss=loss, - width_mult=1.4, - fc_dims=None, - dropout_p=None, - **kwargs, - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_4"] - ) - ) - return model + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_0"] + ) + ) + return model +def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs): + model = MobileNetV2( + num_classes, loss=loss, width_mult=1.4, fc_dims=None, dropout_p=None, **kwargs + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_4"] + ) + ) + return model # Copied from boxmot/boxmot/reid/backbones/mobilenetv2.py diff --git a/ethology/reid/backbones/osnet.py b/ethology/reid/backbones/osnet.py index c13dd5b7..c07e4e45 100644 --- a/ethology/reid/backbones/osnet.py +++ b/ethology/reid/backbones/osnet.py @@ -1,5 +1,6 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +from __future__ import absolute_import, division import warnings @@ -7,529 +8,338 @@ from torch import nn from torch.nn import functional as F -__all__ = [ - "osnet_x1_0", - "osnet_x0_75", - "osnet_x0_5", - "osnet_x0_25", - "osnet_ibn_x1_0", -] +__all__ = ["osnet_x1_0", "osnet_x0_75", "osnet_x0_5", "osnet_x0_25", "osnet_ibn_x1_0"] pretrained_urls = { - "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", - "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", - "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", - "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", - "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", + "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", + "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", + "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", + "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", + "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, ChannelGate, OSBlock... - class ConvLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - groups=1, - IN=False, - ): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - bias=False, - groups=groups, - ) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 1, - stride=stride, - padding=0, - bias=False, - groups=groups, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d( - in_channels, out_channels, 1, stride=stride, padding=0, bias=False - ) - self.bn = nn.BatchNorm2d(out_channels) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return x - + def __init__(self, in_channels, out_channels, stride=1): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) + self.bn = nn.BatchNorm2d(out_channels) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 3, - stride=stride, - padding=1, - bias=False, - groups=groups, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d( - in_channels, out_channels, 1, stride=1, padding=0, bias=False - ) - self.conv2 = nn.Conv2d( - out_channels, - out_channels, - 3, - stride=1, - padding=1, - bias=False, - groups=out_channels, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - x = self.relu(x) - return x - + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + x = self.relu(x) + return x class ChannelGate(nn.Module): - def __init__( - self, - in_channels, - num_gates=None, - return_gates=False, - gate_activation="sigmoid", - reduction=16, - layer_norm=False, - ): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d( - in_channels, - in_channels // reduction, - kernel_size=1, - bias=True, - padding=0, - ) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU(inplace=True) - self.fc2 = nn.Conv2d( - in_channels // reduction, - num_gates, - kernel_size=1, - bias=True, - padding=0, - ) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU(inplace=True) - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError(f"Unknown gate activation: {gate_activation}") - - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x - + def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU(inplace=True) + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x class OSBlock(nn.Module): - def __init__( - self, - in_channels, - out_channels, - IN=False, - bottleneck_reduction=4, - **kwargs, - ): - super(OSBlock, self).__init__() - mid_channels = out_channels // bottleneck_reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2a = LightConv3x3(mid_channels, mid_channels) - self.conv2b = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2c = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2d = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = None - if IN: - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2a = self.conv2a(x1) - x2b = self.conv2b(x1) - x2c = self.conv2c(x1) - x2d = self.conv2d(x1) - x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - if self.IN is not None: - out = self.IN(out) - return F.relu(out) - + def __init__(self, in_channels, out_channels, IN=False, bottleneck_reduction=4, **kwargs): + super(OSBlock, self).__init__() + mid_channels = out_channels // bottleneck_reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2a = LightConv3x3(mid_channels, mid_channels) + self.conv2b = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2c = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2d = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = None + if IN: + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2a = self.conv2a(x1) + x2b = self.conv2b(x1) + x2c = self.conv2c(x1) + x2d = self.conv2d(x1) + x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + if self.IN is not None: + out = self.IN(out) + return F.relu(out) class OSNet(nn.Module): - def __init__( - self, - num_classes, - blocks, - layers, - channels, - feature_dim=512, - loss="softmax", - IN=False, - **kwargs, - ): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer( - blocks[0], - layers[0], - channels[0], - channels[1], - reduce_spatial_size=True, - IN=IN, - ) - self.conv3 = self._make_layer( - blocks[1], - layers[1], - channels[1], - channels[2], - reduce_spatial_size=True, - ) - self.conv4 = self._make_layer( - blocks[2], - layers[2], - channels[2], - channels[3], - reduce_spatial_size=False, - ) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer( - self.feature_dim, channels[3], dropout_p=None - ) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - - def _make_layer( - self, - block, - layer, - in_channels, - out_channels, - reduce_spatial_size, - IN=False, - ): - layers = [] - layers.append(block(in_channels, out_channels, IN=IN)) - for i in range(1, layer): - layers.append(block(out_channels, out_channels, IN=IN)) - if reduce_spatial_size: - layers.append( - nn.Sequential( - Conv1x1(out_channels, out_channels), - nn.AvgPool2d(2, stride=2), - ) - ) - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode="fan_out", nonlinearity="relu" - ) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d) or isinstance( - m, nn.BatchNorm1d - ): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError(f"Unsupported loss: {self.loss}") - + def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", IN=False, **kwargs): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1], reduce_spatial_size=True, IN=IN) + self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2], reduce_spatial_size=True) + self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3], reduce_spatial_size=False) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + def _make_layer(self, block, layer, in_channels, out_channels, reduce_spatial_size, IN=False): + layers = [] + layers.append(block(in_channels, out_channels, IN=IN)) + for i in range(1, layer): + layers.append(block(out_channels, out_channels, IN=IN)) + if reduce_spatial_size: + layers.append(nn.Sequential(Conv1x1(out_channels, out_channels), nn.AvgPool2d(2, stride=2))) + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) def init_pretrained_weights(model, key=""): - import os - from collections import OrderedDict - - import gdown - - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" - ), - ) - ) - return torch_home - - filename = key + "_imagenet.pth" - # Try ethology/models/ directory first - ethology_root = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../") - ) - models_dir = os.path.join(ethology_root, "models") - os.makedirs(models_dir, exist_ok=True) - local_file = os.path.join(models_dir, filename) - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - os.makedirs(model_dir, exist_ok=True) - cached_file = os.path.join(model_dir, filename) - # Prefer ethology/models/ directory file if present - if os.path.exists(local_file): - print(f"[OSNet] Loading model weights from {local_file}") - cached_file = local_file - elif os.path.exists(cached_file): - print(f"[OSNet] Loading model weights from {cached_file}") - else: - print(f"[OSNet] Downloading model weights to {cached_file}") - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - f'The pretrained weights from "{cached_file}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)" - ) - else: - print( - f'Successfully loaded imagenet pretrained weights from "{cached_file}"' - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - f"due to unmatched keys or layer size: {discarded_layers}" - ) - + import errno + import os + from collections import OrderedDict + import gdown + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), + ) + ) + return torch_home + filename = key + "_imagenet.pth" + # Try ethology/models/ directory first + ethology_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) + models_dir = os.path.join(ethology_root, "models") + os.makedirs(models_dir, exist_ok=True) + local_file = os.path.join(models_dir, filename) + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + os.makedirs(model_dir, exist_ok=True) + cached_file = os.path.join(model_dir, filename) + # Prefer ethology/models/ directory file if present + if os.path.exists(local_file): + print(f"[OSNet] Loading model weights from {local_file}") + cached_file = local_file + elif os.path.exists(cached_file): + print(f"[OSNet] Loading model weights from {cached_file}") + else: + print(f"[OSNet] Downloading model weights to {cached_file}") + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights from "{}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)".format(cached_file) + ) + else: + print( + 'Successfully loaded imagenet pretrained weights from "{}"'.format( + cached_file + ) + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + "due to unmatched keys or layer size: {}".format(discarded_layers) + ) def osnet_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet( - num_classes, - blocks=[OSBlock, OSBlock, OSBlock], - layers=[2, 2, 2], - channels=[64, 256, 384, 512], - loss=loss, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_x1_0") - return model - + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x1_0") + return model def osnet_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet( - num_classes, - blocks=[OSBlock, OSBlock, OSBlock], - layers=[2, 2, 2], - channels=[48, 192, 288, 384], - loss=loss, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_75") - return model - + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_75") + return model def osnet_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet( - num_classes, - blocks=[OSBlock, OSBlock, OSBlock], - layers=[2, 2, 2], - channels=[32, 128, 192, 256], - loss=loss, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_5") - return model - + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_5") + return model def osnet_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet( - num_classes, - blocks=[OSBlock, OSBlock, OSBlock], - layers=[2, 2, 2], - channels=[16, 64, 96, 128], - loss=loss, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_25") - return model - - -def osnet_ibn_x1_0( - num_classes=1000, pretrained=True, loss="softmax", **kwargs -): - model = OSNet( - num_classes, - blocks=[OSBlock, OSBlock, OSBlock], - layers=[2, 2, 2], - channels=[64, 256, 384, 512], - loss=loss, - IN=True, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_ibn_x1_0") - return model + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_25") + return model + +def osnet_ibn_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ibn_x1_0") + return model diff --git a/ethology/reid/backbones/osnet_ain.py b/ethology/reid/backbones/osnet_ain.py index 2ef3da25..9e052209 100644 --- a/ethology/reid/backbones/osnet_ain.py +++ b/ethology/reid/backbones/osnet_ain.py @@ -1,5 +1,6 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +from __future__ import absolute_import, division import warnings @@ -7,541 +8,349 @@ from torch import nn from torch.nn import functional as F -__all__ = [ - "osnet_ain_x1_0", - "osnet_ain_x0_75", - "osnet_ain_x0_5", - "osnet_ain_x0_25", -] +__all__ = ["osnet_ain_x1_0", "osnet_ain_x0_75", "osnet_ain_x0_5", "osnet_ain_x0_25"] pretrained_urls = { - "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", - "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", - "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", - "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", + "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", + "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", + "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", + "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, LightConvStream, ChannelGate, OSBlock, OSBlockINin, OSNet, init_pretrained_weights, and instantiation functions... - class ConvLayer(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - groups=1, - IN=False, - ): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - bias=False, - groups=groups, - ) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 1, - stride=stride, - padding=0, - bias=False, - groups=groups, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, bn=True): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d( - in_channels, out_channels, 1, stride=stride, padding=0, bias=False - ) - self.bn = None - if bn: - self.bn = nn.BatchNorm2d(out_channels) - - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - return x - + def __init__(self, in_channels, out_channels, stride=1, bn=True): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) + self.bn = None + if bn: + self.bn = nn.BatchNorm2d(out_channels) + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + return x class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d( - in_channels, - out_channels, - 3, - stride=stride, - padding=1, - bias=False, - groups=groups, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) - + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d( - in_channels, out_channels, 1, stride=1, padding=0, bias=False - ) - self.conv2 = nn.Conv2d( - out_channels, - out_channels, - 3, - stride=1, - padding=1, - bias=False, - groups=out_channels, - ) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - return self.relu(x) - + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + return self.relu(x) class LightConvStream(nn.Module): - def __init__(self, in_channels, out_channels, depth): - super(LightConvStream, self).__init__() - assert depth >= 1 - layers = [LightConv3x3(in_channels, out_channels)] - for i in range(depth - 1): - layers.append(LightConv3x3(out_channels, out_channels)) - self.layers = nn.Sequential(*layers) - - def forward(self, x): - return self.layers(x) - + def __init__(self, in_channels, out_channels, depth): + super(LightConvStream, self).__init__() + assert depth >= 1 + layers = [LightConv3x3(in_channels, out_channels)] + for i in range(depth - 1): + layers.append(LightConv3x3(out_channels, out_channels)) + self.layers = nn.Sequential(*layers) + def forward(self, x): + return self.layers(x) class ChannelGate(nn.Module): - def __init__( - self, - in_channels, - num_gates=None, - return_gates=False, - gate_activation="sigmoid", - reduction=16, - layer_norm=False, - ): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d( - in_channels, - in_channels // reduction, - kernel_size=1, - bias=True, - padding=0, - ) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU() - self.fc2 = nn.Conv2d( - in_channels // reduction, - num_gates, - kernel_size=1, - bias=True, - padding=0, - ) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU() - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError(f"Unknown gate activation: {gate_activation}") - - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x - + def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU() + self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU() + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x class OSBlock(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlock, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList( - [ - LightConvStream(mid_channels, mid_channels, t) - for t in range(1, T + 1) - ] - ) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) - + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlock, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) class OSBlockINin(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlockINin, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList( - [ - LightConvStream(mid_channels, mid_channels, t) - for t in range(1, T + 1) - ] - ) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - x3 = self.IN(x3) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) - + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlockINin, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + x3 = self.IN(x3) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) class OSNet(nn.Module): - def __init__( - self, - num_classes, - blocks, - layers, - channels, - feature_dim=512, - loss="softmax", - conv1_IN=False, - **kwargs, - ): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer( - 3, channels[0], 7, stride=2, padding=3, IN=conv1_IN - ) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer( - blocks[0], layers[0], channels[0], channels[1] - ) - self.pool2 = nn.Sequential( - Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2) - ) - self.conv3 = self._make_layer( - blocks[1], layers[1], channels[1], channels[2] - ) - self.pool3 = nn.Sequential( - Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2) - ) - self.conv4 = self._make_layer( - blocks[2], layers[2], channels[2], channels[3] - ) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer( - self.feature_dim, channels[3], dropout_p=None - ) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - - def _make_layer(self, blocks, layer, in_channels, out_channels): - layers = [] - layers += [blocks[0](in_channels, out_channels)] - for i in range(1, len(blocks)): - layers += [blocks[i](out_channels, out_channels)] - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU()) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode="fan_out", nonlinearity="relu" - ) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif ( - isinstance(m, nn.BatchNorm2d) - or isinstance(m, nn.BatchNorm1d) - or isinstance(m, nn.InstanceNorm2d) - ): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.pool2(x) - x = self.conv3(x) - x = self.pool3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError(f"Unsupported loss: {self.loss}") - + def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", conv1_IN=False, **kwargs): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=conv1_IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1]) + self.pool2 = nn.Sequential(Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2)) + self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2]) + self.pool3 = nn.Sequential(Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2)) + self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3]) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + def _make_layer(self, blocks, layer, in_channels, out_channels): + layers = [] + layers += [blocks[0](in_channels, out_channels)] + for i in range(1, len(blocks)): + layers += [blocks[i](out_channels, out_channels)] + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU()) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.InstanceNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.pool2(x) + x = self.conv3(x) + x = self.pool3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) def init_pretrained_weights(model, key=""): - import errno - import os - from collections import OrderedDict - - import gdown - - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" - ), - ) - ) - return torch_home - - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - try: - os.makedirs(model_dir) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - filename = key + "_imagenet.pth" - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - f'The pretrained weights from "{cached_file}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)" - ) - else: - print( - f'Successfully loaded imagenet pretrained weights from "{cached_file}"' - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - f"due to unmatched keys or layer size: {discarded_layers}" - ) - - -def osnet_ain_x1_0( - num_classes=1000, pretrained=True, loss="softmax", **kwargs -): - model = OSNet( - num_classes, - blocks=[ - [OSBlockINin, OSBlockINin], - [OSBlock, OSBlockINin], - [OSBlockINin, OSBlock], - ], - layers=[2, 2, 2], - channels=[64, 256, 384, 512], - loss=loss, - conv1_IN=True, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x1_0") - return model - - -def osnet_ain_x0_75( - num_classes=1000, pretrained=True, loss="softmax", **kwargs -): - model = OSNet( - num_classes, - blocks=[ - [OSBlockINin, OSBlockINin], - [OSBlock, OSBlockINin], - [OSBlockINin, OSBlock], - ], - layers=[2, 2, 2], - channels=[48, 192, 288, 384], - loss=loss, - conv1_IN=True, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_75") - return model - - -def osnet_ain_x0_5( - num_classes=1000, pretrained=True, loss="softmax", **kwargs -): - model = OSNet( - num_classes, - blocks=[ - [OSBlockINin, OSBlockINin], - [OSBlock, OSBlockINin], - [OSBlockINin, OSBlock], - ], - layers=[2, 2, 2], - channels=[32, 128, 192, 256], - loss=loss, - conv1_IN=True, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_5") - return model - - -def osnet_ain_x0_25( - num_classes=1000, pretrained=True, loss="softmax", **kwargs -): - model = OSNet( - num_classes, - blocks=[ - [OSBlockINin, OSBlockINin], - [OSBlock, OSBlockINin], - [OSBlockINin, OSBlock], - ], - layers=[2, 2, 2], - channels=[16, 64, 96, 128], - loss=loss, - conv1_IN=True, - **kwargs, - ) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_25") - return model + import errno + import os + from collections import OrderedDict + import gdown + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), + ) + ) + return torch_home + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + try: + os.makedirs(model_dir) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + filename = key + "_imagenet.pth" + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + 'The pretrained weights from "{}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)".format(cached_file) + ) + else: + print( + 'Successfully loaded imagenet pretrained weights from "{}"'.format( + cached_file + ) + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + "due to unmatched keys or layer size: {}".format(discarded_layers) + ) + +def osnet_ain_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x1_0") + return model + +def osnet_ain_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_75") + return model + +def osnet_ain_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_5") + return model + +def osnet_ain_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): + model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, conv1_IN=True, **kwargs) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_25") + return model diff --git a/ethology/reid/backbones/resnet.py b/ethology/reid/backbones/resnet.py index 80eda98e..7cf28df1 100644 --- a/ethology/reid/backbones/resnet.py +++ b/ethology/reid/backbones/resnet.py @@ -1,11 +1,19 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license +<<<<<<< HEAD """Code source: https://github.com/pytorch/vision""" +======= +""" +Code source: https://github.com/pytorch/vision +""" +from __future__ import absolute_import, division +>>>>>>> a4dd694 (style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues) import torch.utils.model_zoo as model_zoo from torch import nn __all__ = [ +<<<<<<< HEAD "resnet18", "resnet34", "resnet50", @@ -24,10 +32,31 @@ "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", +======= + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext101_32x8d", + "resnet50_fc512", +] + +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", +>>>>>>> a4dd694 (style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues) } # ...existing code for conv3x3, conv1x1, BasicBlock, Bottleneck, ResNet, init_pretrained_weights, and instantiation functions... +<<<<<<< HEAD def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): return nn.Conv2d( @@ -459,3 +488,245 @@ def resnet50_fc512(num_classes, loss="softmax", pretrained=True, **kwargs): if pretrained: init_pretrained_weights(model, model_urls["resnet50"]) return model +======= +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + +class BasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +class Bottleneck(nn.Module): + expansion = 4 + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +class ResNet(nn.Module): + def __init__(self, num_classes, loss, block, layers, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None, last_stride=2, fc_dims=None, dropout_p=None, **kwargs): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + self.loss = loss + self.feature_dim = 512 * block.expansion + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=last_stride, dilate=replace_stride_with_dilation[2]) + self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = self._construct_fc_layer(fc_dims, 512 * block.expansion, dropout_p) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) + return nn.Sequential(*layers) + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None: + self.feature_dim = input_dim + return None + assert isinstance(fc_dims, (list, tuple)), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + def featuremaps(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError("Unsupported loss: {}".format(self.loss)) + +def init_pretrained_weights(model, model_url): + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size()} + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) + +def resnet18(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[2, 2, 2, 2], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet18"]) + return model + +def resnet34(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=BasicBlock, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet34"]) + return model + +def resnet50(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model + +def resnet101(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet101"]) + return model + +def resnet152(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 8, 36, 3], last_stride=2, fc_dims=None, dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet152"]) + return model + +def resnext50_32x4d(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=4, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnext50_32x4d"]) + return model + +def resnext101_32x8d(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 23, 3], last_stride=2, fc_dims=None, dropout_p=None, groups=32, width_per_group=8, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnext101_32x8d"]) + return model + +def resnet50_fc512(num_classes, loss="softmax", pretrained=True, **kwargs): + model = ResNet(num_classes=num_classes, loss=loss, block=Bottleneck, layers=[3, 4, 6, 3], last_stride=1, fc_dims=[512], dropout_p=None, **kwargs) + if pretrained: + init_pretrained_weights(model, model_urls["resnet50"]) + return model +>>>>>>> a4dd694 (style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues) diff --git a/ethology/reid/backends/base_backend.py b/ethology/reid/backends/base_backend.py index 0edb2826..688ec43a 100644 --- a/ethology/reid/backends/base_backend.py +++ b/ethology/reid/backends/base_backend.py @@ -1,19 +1,18 @@ + +import os from abc import abstractmethod from pathlib import Path - import cv2 import gdown import numpy as np import torch from filelock import SoftFileLock - from ethology.reid.core.registry import ReIDModelRegistry - # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.checks import RequirementsChecker # If needed, implement or set RequirementsChecker - class BaseModelBackend: + def __init__(self, weights, device, half): self.weights = weights[0] if isinstance(weights, list) else weights if isinstance(self.weights, str): @@ -23,7 +22,7 @@ def __init__(self, weights, device, half): self.half = half self.model = None # Support both string and torch.device for device - if hasattr(self.device, "type"): + if hasattr(self.device, 'type'): self.cuda = torch.cuda.is_available() and self.device.type != "cpu" else: self.cuda = torch.cuda.is_available() and self.device != "cpu" @@ -42,19 +41,11 @@ def __init__(self, weights, device, half): self.load_model(self.weights) - self.mean_array = torch.tensor( - [0.485, 0.456, 0.406], device=self.device - ).view(1, 3, 1, 1) - self.std_array = torch.tensor( - [0.229, 0.224, 0.225], device=self.device - ).view(1, 3, 1, 1) + self.mean_array = torch.tensor([0.485, 0.456, 0.406], device=self.device).view(1, 3, 1, 1) + self.std_array = torch.tensor([0.229, 0.224, 0.225], device=self.device).view(1, 3, 1, 1) if "clip" in self.model_name: - self.mean_array = torch.tensor( - [0.5, 0.5, 0.5], device=self.device - ).view(1, 3, 1, 1) - self.std_array = torch.tensor( - [0.5, 0.5, 0.5], device=self.device - ).view(1, 3, 1, 1) + self.mean_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) + self.std_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) if "vehicleid" in self.weights.name or "veri" in self.weights.name: input_shape = (256, 256) @@ -66,6 +57,7 @@ def __init__(self, weights, device, half): input_shape = (256, 128) self.input_shape = input_shape + def get_crops(self, xyxys, img): h, w = img.shape[:2] interpolation_method = cv2.INTER_LINEAR @@ -93,6 +85,7 @@ def get_crops(self, xyxys, img): crops = (crops - self.mean_array) / self.std_array return crops + @torch.no_grad() def get_features(self, xyxys, img): if xyxys.size != 0: @@ -105,6 +98,7 @@ def get_features(self, xyxys, img): features = features / np.linalg.norm(features, axis=-1, keepdims=True) return features + def warmup(self, imgsz=[(256, 128, 3)]): if self.device.type != "cpu": im = np.random.randint(0, 255, *imgsz, dtype=np.uint8) @@ -114,9 +108,11 @@ def warmup(self, imgsz=[(256, 128, 3)]): crops = self.inference_preprocess(crops) self.forward(crops) + def to_numpy(self, x): return x.cpu().numpy() if isinstance(x, torch.Tensor) else x + def inference_preprocess(self, x): if self.half: if isinstance(x, torch.Tensor): @@ -125,34 +121,32 @@ def inference_preprocess(self, x): elif isinstance(x, np.ndarray): if x.dtype != np.float16: x = x.astype(np.float16) - if hasattr(self, "nhwc") and self.nhwc: + if hasattr(self, 'nhwc') and self.nhwc: if isinstance(x, torch.Tensor): x = x.permute(0, 2, 3, 1) elif isinstance(x, np.ndarray): x = np.transpose(x, (0, 2, 3, 1)) return x + def inference_postprocess(self, features): if isinstance(features, (list, tuple)): return ( - self.to_numpy(features[0]) - if len(features) == 1 - else [self.to_numpy(x) for x in features] + self.to_numpy(features[0]) if len(features) == 1 else [self.to_numpy(x) for x in features] ) else: return self.to_numpy(features) + @abstractmethod def forward(self, im_batch): - raise NotImplementedError( - "This method should be implemented by subclasses." - ) + raise NotImplementedError("This method should be implemented by subclasses.") + @abstractmethod def load_model(self, w): - raise NotImplementedError( - "This method should be implemented by subclasses." - ) + raise NotImplementedError("This method should be implemented by subclasses.") + def download_model(self, w): if isinstance(w, str): diff --git a/ethology/reid/backends/onnx_backend.py b/ethology/reid/backends/onnx_backend.py index 41aefb7f..c7c93017 100644 --- a/ethology/reid/backends/onnx_backend.py +++ b/ethology/reid/backends/onnx_backend.py @@ -1,34 +1,31 @@ -from ethology.reid.backends.base_backend import BaseModelBackend +from ethology.reid.backends.base_backend import BaseModelBackend class ONNXBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - - def load_model(self, w): - # ONNXRuntime will attempt to use the first provider, and if it fails or is not - # available for some reason, it will fall back to the next provider in the list - if self.device.type == "mps": - # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) - providers = ["MPSExecutionProvider", "CPUExecutionProvider"] - elif self.device.type == "cuda": - # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - else: - # self.checker.check_packages(("onnxruntime==1.18.1",)) - providers = ["CPUExecutionProvider"] - import onnxruntime + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half - self.session = onnxruntime.InferenceSession( - str(w), providers=providers - ) + def load_model(self, w): + # ONNXRuntime will attempt to use the first provider, and if it fails or is not + # available for some reason, it will fall back to the next provider in the list + if self.device.type == "mps": + # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) + providers = ["MPSExecutionProvider", "CPUExecutionProvider"] + elif self.device.type == "cuda": + # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + else: + # self.checker.check_packages(("onnxruntime==1.18.1",)) + providers = ["CPUExecutionProvider"] + import onnxruntime + self.session = onnxruntime.InferenceSession(str(w), providers=providers) - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() - features = self.session.run( - [self.session.get_outputs()[0].name], - {self.session.get_inputs()[0].name: im_batch}, - )[0] - return features + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() + features = self.session.run( + [self.session.get_outputs()[0].name], + {self.session.get_inputs()[0].name: im_batch}, + )[0] + return features diff --git a/ethology/reid/backends/openvino_backend.py b/ethology/reid/backends/openvino_backend.py index 0c56a06e..f06392bf 100644 --- a/ethology/reid/backends/openvino_backend.py +++ b/ethology/reid/backends/openvino_backend.py @@ -1,49 +1,48 @@ from pathlib import Path from ethology.reid.backends.base_backend import BaseModelBackend - # Note: LOGGER can be replaced with print or a local logger if needed - class OpenVinoBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - - def load_model(self, w): - # self.checker.check_packages(("openvino>=2025.2.0",)) - - print(f"Loading {w} for OpenVINO inference...") - try: - # requires openvino-dev: https://pypi.org/project/openvino-dev/ - from openvino import Core, Layout - except ImportError: - print( - f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" - "requires openvino pip package to be installed!\n" - "$ pip install openvino>=2025.2.0\n" - ) - raise - ie = Core() - w = Path(w) - print(w) - if w.suffix == ".bin": - w = w.with_suffix(".xml") - - if not w.is_file(): # if not *.xml - w = next( - Path(w).glob("*.xml") - ) # get *.xml file from *_openvino_model dir - network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) - if network.get_parameters()[0].get_layout().empty: - network.get_parameters()[0].set_layout(Layout("NCWH")) - self.executable_network = ie.compile_model( - network, device_name="CPU" - ) # device_name="MYRIAD" for Intel NCS2 - self.output_layer = next(iter(self.executable_network.outputs)) - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() # FP32 - features = self.executable_network([im_batch])[self.output_layer] - return features + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # self.checker.check_packages(("openvino>=2025.2.0",)) + + print(f"Loading {w} for OpenVINO inference...") + try: + # requires openvino-dev: https://pypi.org/project/openvino-dev/ + from openvino import Core, Layout + except ImportError: + print( + f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" + "requires openvino pip package to be installed!\n" + "$ pip install openvino>=2025.2.0\n" + ) + raise + ie = Core() + w = Path(w) + print(w) + if w.suffix == '.bin': + w = w.with_suffix('.xml') + + if not w.is_file(): # if not *.xml + w = next( + Path(w).glob("*.xml") + ) # get *.xml file from *_openvino_model dir + network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) + if network.get_parameters()[0].get_layout().empty: + network.get_parameters()[0].set_layout(Layout("NCWH")) + self.executable_network = ie.compile_model( + network, device_name="CPU" + ) # device_name="MYRIAD" for Intel NCS2 + self.output_layer = next(iter(self.executable_network.outputs)) + + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() # FP32 + features = self.executable_network([im_batch])[self.output_layer] + return features diff --git a/ethology/reid/backends/pytorch_backend.py b/ethology/reid/backends/pytorch_backend.py index 2e859cc8..d3dbfa06 100644 --- a/ethology/reid/backends/pytorch_backend.py +++ b/ethology/reid/backends/pytorch_backend.py @@ -1,20 +1,20 @@ from ethology.reid.backends.base_backend import BaseModelBackend from ethology.reid.core.registry import ReIDModelRegistry - class PyTorchBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - def load_model(self, w): - # Load a PyTorch model - if w and w.is_file(): - ReIDModelRegistry.load_pretrained_weights(self.model, w) - self.model.to(self.device).eval() - self.model.half() if self.half else self.model.float() + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # Load a PyTorch model + if w and w.is_file(): + ReIDModelRegistry.load_pretrained_weights(self.model, w) + self.model.to(self.device).eval() + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/backends/tensorrt_backend.py b/ethology/reid/backends/tensorrt_backend.py index 4f6e95b0..8dd7d7ee 100644 --- a/ethology/reid/backends/tensorrt_backend.py +++ b/ethology/reid/backends/tensorrt_backend.py @@ -1,400 +1,310 @@ -# Note: LOGGER can be replaced with print or a local logger if needed -import os from collections import OrderedDict, namedtuple import numpy as np import torch from ethology.reid.backends.base_backend import BaseModelBackend +# Note: LOGGER can be replaced with print or a local logger if needed + +import os +import sys +import torch +import numpy as np +from collections import namedtuple, OrderedDict + + Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) class TensorRTBackend(BaseModelBackend): - def __init__(self, engine_path, device=None): - import hashlib - - import requests - - self.device = device or ( - torch.device("cuda") - if torch.cuda.is_available() - else torch.device("cpu") - ) - self.fp16 = False - self.model_ = None - self.context = None - self.bindings = None - self.binding_addrs = None - self.is_trt10 = False - # Download engine if engine_path is a URL - if engine_path.startswith("http://") or engine_path.startswith( - "https://" - ): - # Use a hash of the URL for filename - engine_hash = hashlib.md5(engine_path.encode()).hexdigest() - filename = f"trt_engine_{engine_hash}.engine" - cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") - os.makedirs(cache_dir, exist_ok=True) - cached_file = os.path.join(cache_dir, filename) - if not os.path.exists(cached_file): - print( - f"[TensorRT] Downloading engine from {engine_path} to {cached_file}" - ) - with requests.get(engine_path, stream=True) as r: - r.raise_for_status() - with open(cached_file, "wb") as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - else: - print(f"[TensorRT] Using cached engine at {cached_file}") - self.engine_path = cached_file - else: - self.engine_path = engine_path - self.load_model(self.engine_path) - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - try: - import pycuda.autoinit # noqa: F401 - import pycuda.driver as cuda - import tensorrt as trt - except ImportError: - raise ImportError( - "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH." - ) - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError( - "CUDA device not available for TensorRT inference." - ) - - Binding = namedtuple( - "Binding", ("name", "dtype", "shape", "data", "ptr") - ) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f, trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = ( - range(self.model_.num_io_tensors) - if self.is_trt10 - else range(self.model_.num_bindings) - ) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = ( - self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - ) - if is_input and -1 in tuple( - self.model_.get_tensor_shape(name) - ): - self.context.set_input_shape( - name, - tuple( - self.model_.get_tensor_profile_shape(name, 0)[1] - ), - ) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = ( - self.model_.get_profile_shape(profile_index, index) - ) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( - self.device - ) - self.bindings[name] = Binding( - name, dtype, shape, data, int(data.data_ptr()) - ) - - self.binding_addrs = OrderedDict( - (n, d.ptr) for n, d in self.bindings.items() - ) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = ( - range(self.model_.num_io_tensors) - if self.is_trt10 - else range(self.model_.num_bindings) - ) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = ( - self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - ) - if is_input and -1 in tuple( - self.model_.get_tensor_shape(name) - ): - self.context.set_input_shape( - name, - tuple( - self.model_.get_tensor_profile_shape(name, 0)[1] - ), - ) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = ( - self.model_.get_profile_shape(profile_index, index) - ) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( - self.device - ) - self.bindings[name] = Binding( - name, dtype, shape, data, int(data.data_ptr()) - ) - - self.binding_addrs = OrderedDict( - (n, d.ptr) for n, d in self.bindings.items() - ) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace( - shape=temp_batch.shape - ) - self.bindings["output"].data.resize_( - tuple(self.context.get_tensor_shape("output")) - ) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace( - shape=temp_batch.shape - ) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, ( - f"Input size {temp_batch.shape} does not match model size {s}" - ) - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - # self.checker.check_packages(("nvidia-tensorrt",)) - try: - import tensorrt as trt # TensorRT library - except ImportError: - raise ImportError("Please install tensorrt to use this backend.") - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError( - "CUDA device not available for TensorRT inference." - ) - - Binding = namedtuple( - "Binding", ("name", "dtype", "shape", "data", "ptr") - ) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f, trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = ( - range(self.model_.num_io_tensors) - if self.is_trt10 - else range(self.model_.num_bindings) - ) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = ( - self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - ) - if is_input and -1 in tuple( - self.model_.get_tensor_shape(name) - ): - self.context.set_input_shape( - name, - tuple( - self.model_.get_tensor_profile_shape(name, 0)[1] - ), - ) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = ( - self.model_.get_profile_shape(profile_index, index) - ) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( - self.device - ) - self.bindings[name] = Binding( - name, dtype, shape, data, int(data.data_ptr()) - ) - - self.binding_addrs = OrderedDict( - (n, d.ptr) for n, d in self.bindings.items() - ) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace( - shape=temp_batch.shape - ) - self.bindings["output"].data.resize_( - tuple(self.context.get_tensor_shape("output")) - ) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace( - shape=temp_batch.shape - ) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, ( - f"Input size {temp_batch.shape} does not match model size {s}" - ) - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features + def __init__(self, engine_path, device=None): + import hashlib + import requests + self.device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) + self.fp16 = False + self.model_ = None + self.context = None + self.bindings = None + self.binding_addrs = None + self.is_trt10 = False + # Download engine if engine_path is a URL + if engine_path.startswith("http://") or engine_path.startswith("https://"): + # Use a hash of the URL for filename + engine_hash = hashlib.md5(engine_path.encode()).hexdigest() + filename = f"trt_engine_{engine_hash}.engine" + cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") + os.makedirs(cache_dir, exist_ok=True) + cached_file = os.path.join(cache_dir, filename) + if not os.path.exists(cached_file): + print(f"[TensorRT] Downloading engine from {engine_path} to {cached_file}") + with requests.get(engine_path, stream=True) as r: + r.raise_for_status() + with open(cached_file, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + else: + print(f"[TensorRT] Using cached engine at {cached_file}") + self.engine_path = cached_file + else: + self.engine_path = engine_path + self.load_model(self.engine_path) + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + try: + import tensorrt as trt + import pycuda.driver as cuda + import pycuda.autoinit # noqa: F401 + except ImportError: + raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError("CUDA device not available for TensorRT inference.") + + Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f: + with trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + # self.checker.check_packages(("nvidia-tensorrt",)) + try: + import tensorrt as trt # TensorRT library + except ImportError: + raise ImportError("Please install tensorrt to use this backend.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError("CUDA device not available for TensorRT inference.") + + Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f: + with trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): + self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) + self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) + + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features diff --git a/ethology/reid/backends/tflite_backend.py b/ethology/reid/backends/tflite_backend.py index eb10d4e8..b0a7b707 100644 --- a/ethology/reid/backends/tflite_backend.py +++ b/ethology/reid/backends/tflite_backend.py @@ -4,39 +4,37 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend - # Note: LOGGER can be replaced with print or a local logger if needed - class TFLiteBackend(BaseModelBackend): - """A class to handle TensorFlow Lite model inference with dynamic batch size support.""" - - def __init__(self, weights: Path, device: str, half: bool): - super().__init__(weights, device, half) - self.nhwc = True - self.half = False - - def load_model(self, w): - # self.checker.check_packages(("tensorflow",)) - print(f"Loading {str(w)} for TensorFlow Lite inference...") - import tensorflow as tf - - self.interpreter = tf.lite.Interpreter(model_path=str(w)) - self.interpreter.allocate_tensors() - self.input_details = self.interpreter.get_input_details() - self.output_details = self.interpreter.get_output_details() - self.current_allocated_batch_size = self.input_details[0]["shape"][0] - - def forward(self, im_batch: torch.Tensor) -> np.ndarray: - im_batch = im_batch.cpu().numpy() - batch_size = im_batch.shape[0] - if batch_size != self.current_allocated_batch_size: - self.interpreter.resize_tensor_input( - self.input_details[0]["index"], [batch_size, 256, 128, 3] - ) - self.interpreter.allocate_tensors() - self.current_allocated_batch_size = batch_size - self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) - self.interpreter.invoke() - features = self.interpreter.get_tensor(self.output_details[0]["index"]) - return features + """ + A class to handle TensorFlow Lite model inference with dynamic batch size support. + """ + def __init__(self, weights: Path, device: str, half: bool): + super().__init__(weights, device, half) + self.nhwc = True + self.half = False + + def load_model(self, w): + # self.checker.check_packages(("tensorflow",)) + print(f"Loading {str(w)} for TensorFlow Lite inference...") + import tensorflow as tf + self.interpreter = tf.lite.Interpreter(model_path=str(w)) + self.interpreter.allocate_tensors() + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + self.current_allocated_batch_size = self.input_details[0]["shape"][0] + + def forward(self, im_batch: torch.Tensor) -> np.ndarray: + im_batch = im_batch.cpu().numpy() + batch_size = im_batch.shape[0] + if batch_size != self.current_allocated_batch_size: + self.interpreter.resize_tensor_input( + self.input_details[0]["index"], [batch_size, 256, 128, 3] + ) + self.interpreter.allocate_tensors() + self.current_allocated_batch_size = batch_size + self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) + self.interpreter.invoke() + features = self.interpreter.get_tensor(self.output_details[0]["index"]) + return features diff --git a/ethology/reid/backends/torchscript_backend.py b/ethology/reid/backends/torchscript_backend.py index 1142fcc4..b6602171 100644 --- a/ethology/reid/backends/torchscript_backend.py +++ b/ethology/reid/backends/torchscript_backend.py @@ -1,21 +1,20 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend - # Note: LOGGER can be replaced with print or a local logger if needed - class TorchscriptBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - def load_model(self, w): - print(f"Loading {w} for TorchScript inference...") - self.model = torch.jit.load(w) - self.model.half() if self.half else self.model.float() + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + print(f"Loading {w} for TorchScript inference...") + self.model = torch.jit.load(w) + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/core/auto_backend.py b/ethology/reid/core/auto_backend.py index 22f2c4e2..6f43eba2 100644 --- a/ethology/reid/core/auto_backend.py +++ b/ethology/reid/core/auto_backend.py @@ -1,89 +1,74 @@ -from pathlib import Path +from pathlib import Path +from typing import Tuple, Union import torch - from ethology.reid.backends.onnx_backend import ONNXBackend from ethology.reid.backends.openvino_backend import OpenVinoBackend from ethology.reid.backends.pytorch_backend import PyTorchBackend - try: - from ethology.reid.backends.tensorrt_backend import TensorRTBackend + from ethology.reid.backends.tensorrt_backend import TensorRTBackend except ImportError: - - class TensorRTBackend: - def __init__(self, *args, **kwargs): - raise ImportError( - "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH." - ) - - + class TensorRTBackend: + def __init__(self, *args, **kwargs): + raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH.") from ethology.reid.backends.tflite_backend import TFLiteBackend from ethology.reid.backends.torchscript_backend import TorchscriptBackend - # from ethology.reid.core import export_formats # If needed, implement or copy export_formats # from ethology.utils import WEIGHTS # If needed, implement or set WEIGHTS # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.torch_utils import select_device # If needed, implement or set select_device - class ReidAutoBackend: - def __init__( - self, - weights: Path, - device: torch.device = torch.device("cpu"), - half: bool = False, - ): - super().__init__() - w = weights[0] if isinstance(weights, list) else weights - ( - self.pt, - self.pth, - self.jit, - self.onnx, - self.xml, - self.engine, - self.tflite, - ) = self.model_type(w) - self.weights = weights - self.device = device # For simplicity, skip select_device for now - self.half = half - self.model = self.get_backend() + def __init__( + self, + weights: Path, + device: torch.device = torch.device("cpu"), + half: bool = False, + ): + super().__init__() + w = weights[0] if isinstance(weights, list) else weights + ( + self.pt, + self.pth, + self.jit, + self.onnx, + self.xml, + self.engine, + self.tflite, + ) = self.model_type(w) + self.weights = weights + self.device = device # For simplicity, skip select_device for now + self.half = half + self.model = self.get_backend() - def get_backend(self): - backend_map = { - self.pt or self.pth: PyTorchBackend, - self.jit: TorchscriptBackend, - self.onnx: ONNXBackend, - self.engine: TensorRTBackend, - self.xml: OpenVinoBackend, - self.tflite: TFLiteBackend, - } - for condition, backend_class in backend_map.items(): - if condition: - return backend_class(self.weights, self.device, self.half) - raise RuntimeError("This model framework is not supported yet!") + def get_backend(self): + backend_map = { + self.pt or self.pth: PyTorchBackend, + self.jit: TorchscriptBackend, + self.onnx: ONNXBackend, + self.engine: TensorRTBackend, + self.xml: OpenVinoBackend, + self.tflite: TFLiteBackend, + } + for condition, backend_class in backend_map.items(): + if condition: + return backend_class(self.weights, self.device, self.half) + raise RuntimeError("This model framework is not supported yet!") - def check_suffix( - self, - file: Path = "osnet_x0_25_msmt17.pt", - suffix: str | tuple[str, ...] = (".pt",), - msg: str = "", - ): - suffix = [suffix] if isinstance(suffix, str) else list(suffix) - files = [file] if isinstance(file, (str, Path)) else list(file) - for f in files: - file_suffix = Path(f).suffix.lower() - if file_suffix and file_suffix not in suffix: - print( - f"File {f} does not have an acceptable suffix. Expected: {suffix}" - ) + def check_suffix(self, file: Path = "osnet_x0_25_msmt17.pt", suffix: Union[str, Tuple[str, ...]] = (".pt",), msg: str = ""): + suffix = [suffix] if isinstance(suffix, str) else list(suffix) + files = [file] if isinstance(file, (str, Path)) else list(file) + for f in files: + file_suffix = Path(f).suffix.lower() + if file_suffix and file_suffix not in suffix: + print(f"File {f} does not have an acceptable suffix. Expected: {suffix}") - def model_type(self, p: Path) -> tuple[bool, ...]: - # For demo, just check for .pt - sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] - self.check_suffix(p, sf) - types = [str(Path(p)).endswith(s) for s in sf] - # OpenVINO explicit check - if Path(p).suffix in [".xml", ".bin"]: - types[3] = True - return tuple(types) + def model_type(self, p: Path) -> Tuple[bool, ...]: + # For demo, just check for .pt + sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] + self.check_suffix(p, sf) + types = [str(Path(p)).endswith(s) for s in sf] + # OpenVINO explicit check + if Path(p).suffix in ['.xml', '.bin']: + types[3] = True + return tuple(types) diff --git a/ethology/reid/core/config.py b/ethology/reid/core/config.py index dc17cc14..926c0cc9 100644 --- a/ethology/reid/core/config.py +++ b/ethology/reid/core/config.py @@ -1,16 +1,16 @@ MODEL_TYPES = [ - "resnet50", - "resnet101", - "mlfn", - "hacnn", - "mobilenetv2_x1_0", - "mobilenetv2_x1_4", - "osnet_x1_0", - "osnet_x0_75", - "osnet_x0_5", - "osnet_x0_25", - "osnet_ibn_x1_0", - "osnet_ain_x1_0", - "lmbn_n", - "clip", + "resnet50", + "resnet101", + "mlfn", + "hacnn", + "mobilenetv2_x1_0", + "mobilenetv2_x1_4", + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", + "osnet_ain_x1_0", + "lmbn_n", + "clip", ] diff --git a/ethology/reid/core/factory.py b/ethology/reid/core/factory.py index 27406383..bc8b6ab1 100644 --- a/ethology/reid/core/factory.py +++ b/ethology/reid/core/factory.py @@ -1,44 +1,30 @@ + # Import model constructors from ethology's local backbones from ethology.reid.backbones.hacnn import HACNN from ethology.reid.backbones.mlfn import mlfn -from ethology.reid.backbones.mobilenetv2 import ( - mobilenetv2_x1_0, - mobilenetv2_x1_4, -) -from ethology.reid.backbones.osnet import ( - osnet_ibn_x1_0, - osnet_x0_5, - osnet_x0_25, - osnet_x0_75, - osnet_x1_0, -) -from ethology.reid.backbones.osnet_ain import ( - osnet_ain_x0_5, - osnet_ain_x0_25, - osnet_ain_x0_75, - osnet_ain_x1_0, -) +from ethology.reid.backbones.mobilenetv2 import mobilenetv2_x1_0, mobilenetv2_x1_4 +from ethology.reid.backbones.osnet import osnet_ibn_x1_0, osnet_x0_5, osnet_x0_25, osnet_x0_75, osnet_x1_0 +from ethology.reid.backbones.osnet_ain import osnet_ain_x0_5, osnet_ain_x0_25, osnet_ain_x0_75, osnet_ain_x1_0 from ethology.reid.backbones.resnet import resnet50, resnet101 - # from ethology.reid.backbones.lmbn.lmbn_n import LMBN_n # If present # from ethology.reid.backbones.clip.make_model import make_model # If present MODEL_FACTORY = { - "resnet50": resnet50, - "resnet101": resnet101, - "mobilenetv2_x1_0": mobilenetv2_x1_0, - "mobilenetv2_x1_4": mobilenetv2_x1_4, - "hacnn": HACNN, - "mlfn": mlfn, - "osnet_x1_0": osnet_x1_0, - "osnet_x0_75": osnet_x0_75, - "osnet_x0_5": osnet_x0_5, - "osnet_x0_25": osnet_x0_25, - "osnet_ibn_x1_0": osnet_ibn_x1_0, - "osnet_ain_x1_0": osnet_ain_x1_0, - "osnet_ain_x0_75": osnet_ain_x0_75, - "osnet_ain_x0_5": osnet_ain_x0_5, - "osnet_ain_x0_25": osnet_ain_x0_25, - # "lmbn_n": LMBN_n, # Uncomment if implemented - # "clip": make_model, # Uncomment if implemented + "resnet50": resnet50, + "resnet101": resnet101, + "mobilenetv2_x1_0": mobilenetv2_x1_0, + "mobilenetv2_x1_4": mobilenetv2_x1_4, + "hacnn": HACNN, + "mlfn": mlfn, + "osnet_x1_0": osnet_x1_0, + "osnet_x0_75": osnet_x0_75, + "osnet_x0_5": osnet_x0_5, + "osnet_x0_25": osnet_x0_25, + "osnet_ibn_x1_0": osnet_ibn_x1_0, + "osnet_ain_x1_0": osnet_ain_x1_0, + "osnet_ain_x0_75": osnet_ain_x0_75, + "osnet_ain_x0_5": osnet_ain_x0_5, + "osnet_ain_x0_25": osnet_ain_x0_25, + # "lmbn_n": LMBN_n, # Uncomment if implemented + # "clip": make_model, # Uncomment if implemented } diff --git a/ethology/reid/core/handler.py b/ethology/reid/core/handler.py index ba521ab2..b5e51391 100644 --- a/ethology/reid/core/handler.py +++ b/ethology/reid/core/handler.py @@ -2,35 +2,32 @@ # Thin wrapper to use BoxMOT ReID models in ethology from pathlib import Path - +from typing import Union import numpy as np + # Import ethology's local ReID handler from ethology.reid.core.reid_handler import ReID as EthologyReID - class ReIDHandler: - """Ethology ReID handler using local models and backends.""" - - def __init__(self, weights: str | Path, device="cpu", half=False): + """ + Ethology ReID handler using local models and backends. + """ + def __init__(self, weights: Union[str, Path], device='cpu', half=False): self.model = EthologyReID(weights=weights, device=device, half=half) - def extract_features( - self, frame: np.ndarray, dets: np.ndarray - ) -> np.ndarray: - """Extract feature embeddings for detections in a frame. - + def extract_features(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """ + Extract feature embeddings for detections in a frame. Parameters ---------- frame : np.ndarray (H, W, C) BGR image. dets : np.ndarray (N, 6) array of detections (x1, y1, x2, y2, conf, cls). - Returns ------- np.ndarray (N, D) feature embeddings. - """ return self.model(frame, dets) diff --git a/ethology/reid/core/registry.py b/ethology/reid/core/registry.py index 4b9c27fd..333cff2f 100644 --- a/ethology/reid/core/registry.py +++ b/ethology/reid/core/registry.py @@ -1,88 +1,71 @@ -from collections import OrderedDict +from collections import OrderedDict import torch - -from ethology.reid.core.config import ( - MODEL_TYPES, # , NR_CLASSES_DICT, TRAINED_URLS -) +from ethology.reid.core.config import MODEL_TYPES #, NR_CLASSES_DICT, TRAINED_URLS from ethology.reid.core.factory import MODEL_FACTORY - # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER - class ReIDModelRegistry: - """Encapsulates model registration and related utilities.""" + """Encapsulates model registration and related utilities.""" - @staticmethod - def show_downloadable_models(): - # LOGGER.info("Available .pt ReID models for automatic download") - # LOGGER.info(list(TRAINED_URLS.keys())) - pass + @staticmethod + def show_downloadable_models(): + # LOGGER.info("Available .pt ReID models for automatic download") + # LOGGER.info(list(TRAINED_URLS.keys())) + pass - @staticmethod - def get_model_name(model): - for name in MODEL_TYPES: - if name in model.name: - return name - return None + @staticmethod + def get_model_name(model): + for name in MODEL_TYPES: + if name in model.name: + return name + return None - @staticmethod - def get_model_url(model): - # return TRAINED_URLS.get(model.name, None) - return None + @staticmethod + def get_model_url(model): + # return TRAINED_URLS.get(model.name, None) + return None - @staticmethod - def load_pretrained_weights(model, weight_path): - device = "cpu" if not torch.cuda.is_available() else None - checkpoint = torch.load( - weight_path, - map_location=torch.device("cpu") if device == "cpu" else None, - weights_only=False, - encoding="latin1", - ) - state_dict = checkpoint.get("state_dict", checkpoint) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - key = k[7:] if k.startswith("module.") else k - if key in model_dict and model_dict[key].size() == v.size(): - new_state_dict[key] = v - matched_layers.append(key) - else: - discarded_layers.append(key) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) + @staticmethod + def load_pretrained_weights(model, weight_path): + device = "cpu" if not torch.cuda.is_available() else None + checkpoint = torch.load( + weight_path, + map_location=torch.device("cpu") if device == "cpu" else None, + weights_only=False, + encoding='latin1', + ) + state_dict = checkpoint.get("state_dict", checkpoint) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + key = k[7:] if k.startswith("module.") else k + if key in model_dict and model_dict[key].size() == v.size(): + new_state_dict[key] = v + matched_layers.append(key) + else: + discarded_layers.append(key) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) - @staticmethod - def show_available_models(): - # LOGGER.info("Available models:") - # LOGGER.info(list(MODEL_FACTORY.keys())) - pass + @staticmethod + def show_available_models(): + # LOGGER.info("Available models:") + # LOGGER.info(list(MODEL_FACTORY.keys())) + pass - @staticmethod - def get_nr_classes(weights): - # dataset_key = weights.name.split("_")[1] - # return NR_CLASSES_DICT.get(dataset_key, 1) - return 1 + @staticmethod + def get_nr_classes(weights): + # dataset_key = weights.name.split("_")[1] + # return NR_CLASSES_DICT.get(dataset_key, 1) + return 1 - @staticmethod - def build_model( - name, - weights, - num_classes, - loss="softmax", - pretrained=True, - use_gpu=True, - ): - if name not in MODEL_FACTORY: - available = list(MODEL_FACTORY.keys()) - raise KeyError( - f"Unknown model '{name}'. Must be one of {available}" - ) - return MODEL_FACTORY[name]( - num_classes=num_classes, - loss=loss, - pretrained=pretrained, - use_gpu=use_gpu, - ) + @staticmethod + def build_model(name, weights, num_classes, loss="softmax", pretrained=True, use_gpu=True): + if name not in MODEL_FACTORY: + available = list(MODEL_FACTORY.keys()) + raise KeyError(f"Unknown model '{name}'. Must be one of {available}") + return MODEL_FACTORY[name]( + num_classes=num_classes, loss=loss, pretrained=pretrained, use_gpu=use_gpu + ) diff --git a/ethology/reid/core/reid_handler.py b/ethology/reid/core/reid_handler.py index 62d42209..2c72658a 100644 --- a/ethology/reid/core/reid_handler.py +++ b/ethology/reid/core/reid_handler.py @@ -1,33 +1,28 @@ -from pathlib import Path +from pathlib import Path +from typing import Union import numpy as np - from ethology.reid.core.auto_backend import ReidAutoBackend - class ReID: - def __init__(self, weights: str | Path, device="cpu", half=False): - self.weights = Path(weights) - self.device = device - self.half = half - self.backend = ReidAutoBackend( - weights=self.weights, device=device, half=half - ) - self.model = self.backend.model - - def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: - """Extract features for detections in a frame. - - Args: - frame: (H, W, C) BGR image - dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. - - Returns: - embs: (N, D) embeddings. - - """ - if dets.shape[0] == 0: - return np.empty((0, 0)) - xyxy = dets[:, :4] - embs = self.model.get_features(xyxy, frame) - return embs + def __init__(self, weights: Union[str, Path], device='cpu', half=False): + self.weights = Path(weights) + self.device = device + self.half = half + self.backend = ReidAutoBackend(weights=self.weights, device=device, half=half) + self.model = self.backend.model + + def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """ + Extract features for detections in a frame. + Args: + frame: (H, W, C) BGR image + dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. + Returns: + embs: (N, D) embeddings. + """ + if dets.shape[0] == 0: + return np.empty((0, 0)) + xyxy = dets[:, :4] + embs = self.model.get_features(xyxy, frame) + return embs diff --git a/tests/test_unit/test_reid_handler.py b/tests/test_unit/test_reid_handler.py index bc8a199c..3a5146cf 100644 --- a/tests/test_unit/test_reid_handler.py +++ b/tests/test_unit/test_reid_handler.py @@ -1,16 +1,12 @@ import numpy as np - from ethology.reid.core.handler import ReIDHandler - def test_extract_features_shape(): - handler = ReIDHandler(weights="osnet_x0_25_imagenet.pth") + handler = ReIDHandler(weights='osnet_x0_25_imagenet.pth') frame = np.random.randint(0, 255, (128, 64, 3), dtype=np.uint8) - dets = np.array( - [ - [10, 10, 50, 100, 0.9, 1], - [60, 20, 100, 110, 0.8, 2], - ] - ) + dets = np.array([ + [10, 10, 50, 100, 0.9, 1], + [60, 20, 100, 110, 0.8, 2], + ]) feats = handler.extract_features(frame, dets) assert feats.shape[0] == dets.shape[0] From 04d9a06e914bb40d3505102458a0968c1a4108f4 Mon Sep 17 00:00:00 2001 From: AnandMayank Date: Wed, 18 Feb 2026 20:08:13 +0530 Subject: [PATCH 08/12] docs(examples): add ReID trajectory utility example with MOT results --- examples/reid_mot_example.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 examples/reid_mot_example.py diff --git a/examples/reid_mot_example.py b/examples/reid_mot_example.py new file mode 100644 index 00000000..3862e1dd --- /dev/null +++ b/examples/reid_mot_example.py @@ -0,0 +1,23 @@ +""" +Example: Using the new ReID trajectory utility with MOT results + +This script demonstrates how to use the ethology reid trajectory handler with a sample MOT output. +""" + +from ethology.reid.core.reid_handler import ReIDTrajectoryHandler + +# Example: Dummy MOT results (replace with your actual MOT output) +mot_results = [ + {'id': 1, 'trajectory': [(0, 0), (1, 1), (2, 2)]}, + {'id': 2, 'trajectory': [(5, 5), (6, 6), (7, 7)]}, +] + +# Initialize the handler (adjust parameters as needed) +reid_handler = ReIDTrajectoryHandler(model_name='osnet', device='cpu') + +# Run re-identification on the MOT results +reid_results = reid_handler.reidentify(mot_results) + +print('ReID Results:') +for item in reid_results: + print(item) From eff1f28d540188be94217547a9335a9fb1d4e1ca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:39:07 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ethology/reid/backbones/hacnn.py | 592 +++++++------ ethology/reid/backbones/mlfn.py | 6 +- ethology/reid/backbones/mobilenetv2.py | 456 +++++----- ethology/reid/backbones/osnet.py | 800 ++++++++++------- ethology/reid/backbones/osnet_ain.py | 831 +++++++++++------- ethology/reid/backends/base_backend.py | 48 +- ethology/reid/backends/onnx_backend.py | 55 +- ethology/reid/backends/openvino_backend.py | 83 +- ethology/reid/backends/pytorch_backend.py | 28 +- ethology/reid/backends/tensorrt_backend.py | 688 ++++++++------- ethology/reid/backends/tflite_backend.py | 64 +- ethology/reid/backends/torchscript_backend.py | 25 +- ethology/reid/core/auto_backend.py | 127 +-- ethology/reid/core/config.py | 28 +- ethology/reid/core/factory.py | 56 +- ethology/reid/core/handler.py | 21 +- ethology/reid/core/registry.py | 133 +-- ethology/reid/core/reid_handler.py | 51 +- examples/reid_mot_example.py | 11 +- tests/test_unit/test_reid_handler.py | 14 +- 20 files changed, 2359 insertions(+), 1758 deletions(-) diff --git a/ethology/reid/backbones/hacnn.py b/ethology/reid/backbones/hacnn.py index f3a65746..27771cf7 100644 --- a/ethology/reid/backbones/hacnn.py +++ b/ethology/reid/backbones/hacnn.py @@ -1,7 +1,5 @@ """HACNN backbone for person re-identification.""" -from __future__ import absolute_import, division - import torch from torch import nn from torch.nn import functional as F @@ -10,298 +8,330 @@ class ConvBlock(nn.Module): - def __init__(self, in_c, out_c, k, s=1, p=0): - """Convolutional block with batch norm and ReLU.""" - super().__init__() - self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) - self.bn = nn.BatchNorm2d(out_c) - def forward(self, x): - return F.relu(self.bn(self.conv(x))) + def __init__(self, in_c, out_c, k, s=1, p=0): + """Convolutional block with batch norm and ReLU.""" + super().__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p) + self.bn = nn.BatchNorm2d(out_c) + + def forward(self, x): + return F.relu(self.bn(self.conv(x))) + class InceptionA(nn.Module): - def __init__(self, in_channels, out_channels): - """InceptionA block.""" - super().__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream3 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ) - self.stream4 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1), - ConvBlock(in_channels, mid_channels, 1), - ) - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - s4 = self.stream4(x) - y = torch.cat([s1, s2, s3, s4], dim=1) - return y + def __init__(self, in_channels, out_channels): + """InceptionA block.""" + super().__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream3 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ) + self.stream4 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1), + ConvBlock(in_channels, mid_channels, 1), + ) + + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + s4 = self.stream4(x) + y = torch.cat([s1, s2, s3, s4], dim=1) + return y + class InceptionB(nn.Module): - def __init__(self, in_channels, out_channels): - """InceptionB block.""" - super().__init__() - mid_channels = out_channels // 4 - self.stream1 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream2 = nn.Sequential( - ConvBlock(in_channels, mid_channels, 1), - ConvBlock(mid_channels, mid_channels, 3, p=1), - ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), - ) - self.stream3 = nn.Sequential( - nn.MaxPool2d(3, stride=2, padding=1), - ConvBlock(in_channels, mid_channels * 2, 1), - ) - def forward(self, x): - s1 = self.stream1(x) - s2 = self.stream2(x) - s3 = self.stream3(x) - y = torch.cat([s1, s2, s3], dim=1) - return y + def __init__(self, in_channels, out_channels): + """InceptionB block.""" + super().__init__() + mid_channels = out_channels // 4 + self.stream1 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream2 = nn.Sequential( + ConvBlock(in_channels, mid_channels, 1), + ConvBlock(mid_channels, mid_channels, 3, p=1), + ConvBlock(mid_channels, mid_channels, 3, s=2, p=1), + ) + self.stream3 = nn.Sequential( + nn.MaxPool2d(3, stride=2, padding=1), + ConvBlock(in_channels, mid_channels * 2, 1), + ) + + def forward(self, x): + s1 = self.stream1(x) + s2 = self.stream2(x) + s3 = self.stream3(x) + y = torch.cat([s1, s2, s3], dim=1) + return y + class SpatialAttn(nn.Module): - def __init__(self): - """Spatial attention block.""" - super().__init__() - self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) - self.conv2 = ConvBlock(1, 1, 1) - def forward(self, x): - x = x.mean(1, keepdim=True) - x = self.conv1(x) - x = F.interpolate( - x, (x.size(2) * 2, x.size(3) * 2), mode="bilinear", align_corners=True - ) - x = self.conv2(x) - return x + def __init__(self): + """Spatial attention block.""" + super().__init__() + self.conv1 = ConvBlock(1, 1, 3, s=2, p=1) + self.conv2 = ConvBlock(1, 1, 1) + + def forward(self, x): + x = x.mean(1, keepdim=True) + x = self.conv1(x) + x = F.interpolate( + x, + (x.size(2) * 2, x.size(3) * 2), + mode="bilinear", + align_corners=True, + ) + x = self.conv2(x) + return x + class ChannelAttn(nn.Module): - def __init__(self, in_channels, reduction_rate=16): - """Channel attention block.""" - super().__init__() - assert in_channels % reduction_rate == 0 - self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) - self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]) - x = self.conv1(x) - x = self.conv2(x) - return x + def __init__(self, in_channels, reduction_rate=16): + """Channel attention block.""" + super().__init__() + assert in_channels % reduction_rate == 0 + self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1) + self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1) + + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]) + x = self.conv1(x) + x = self.conv2(x) + return x + class SoftAttn(nn.Module): - def __init__(self, in_channels): - """Soft attention block.""" - super().__init__() - self.spatial_attn = SpatialAttn() - self.channel_attn = ChannelAttn(in_channels) - self.conv = ConvBlock(in_channels, in_channels, 1) - def forward(self, x): - y_spatial = self.spatial_attn(x) - y_channel = self.channel_attn(x) - y = y_spatial * y_channel - y = torch.sigmoid(self.conv(y)) - return y + def __init__(self, in_channels): + """Soft attention block.""" + super().__init__() + self.spatial_attn = SpatialAttn() + self.channel_attn = ChannelAttn(in_channels) + self.conv = ConvBlock(in_channels, in_channels, 1) + + def forward(self, x): + y_spatial = self.spatial_attn(x) + y_channel = self.channel_attn(x) + y = y_spatial * y_channel + y = torch.sigmoid(self.conv(y)) + return y + class HardAttn(nn.Module): - def __init__(self, in_channels): - """Hard attention block.""" - super().__init__() - self.fc = nn.Linear(in_channels, 4 * 2) - self.init_params() - def init_params(self): - self.fc.weight.data.zero_() - self.fc.bias.data.copy_( - torch.tensor([0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float) - ) - def forward(self, x): - x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) - theta = torch.tanh(self.fc(x)) - theta = theta.view(-1, 4, 2) - return theta + def __init__(self, in_channels): + """Hard attention block.""" + super().__init__() + self.fc = nn.Linear(in_channels, 4 * 2) + self.init_params() + + def init_params(self): + self.fc.weight.data.zero_() + self.fc.bias.data.copy_( + torch.tensor( + [0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float + ) + ) + + def forward(self, x): + x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1)) + theta = torch.tanh(self.fc(x)) + theta = theta.view(-1, 4, 2) + return theta + class HarmAttn(nn.Module): - def __init__(self, in_channels): - """Harmonious attention block.""" - super().__init__() - self.soft_attn = SoftAttn(in_channels) - self.hard_attn = HardAttn(in_channels) - def forward(self, x): - y_soft_attn = self.soft_attn(x) - theta = self.hard_attn(x) - return y_soft_attn, theta + def __init__(self, in_channels): + """Harmonious attention block.""" + super().__init__() + self.soft_attn = SoftAttn(in_channels) + self.hard_attn = HardAttn(in_channels) + + def forward(self, x): + y_soft_attn = self.soft_attn(x) + theta = self.hard_attn(x) + return y_soft_attn, theta + class HACNN(nn.Module): - def __init__( - self, - num_classes, - loss="softmax", - nchannels=None, - feat_dim=512, - learn_region=True, - use_gpu=True, - **kwargs, - ): - """Harmonious Attention Convolutional Neural Network (HACNN) for person re-identification.""" - super().__init__() - if nchannels is None: - nchannels = [128, 256, 384] - self.loss = loss - self.learn_region = learn_region - self.use_gpu = use_gpu - self.conv = ConvBlock(3, 32, 3, s=2, p=1) - self.inception1 = nn.Sequential( - InceptionA(32, nchannels[0]), - InceptionB(nchannels[0], nchannels[0]), - ) - self.ha1 = HarmAttn(nchannels[0]) - self.inception2 = nn.Sequential( - InceptionA(nchannels[0], nchannels[1]), - InceptionB(nchannels[1], nchannels[1]), - ) - self.ha2 = HarmAttn(nchannels[1]) - self.inception3 = nn.Sequential( - InceptionA(nchannels[1], nchannels[2]), - InceptionB(nchannels[2], nchannels[2]), - ) - self.ha3 = HarmAttn(nchannels[2]) - self.fc_global = nn.Sequential( - nn.Linear(nchannels[2], feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_global = nn.Linear(feat_dim, num_classes) - if self.learn_region: - self.init_scale_factors() - self.local_conv1 = InceptionB(32, nchannels[0]) - self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) - self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) - self.fc_local = nn.Sequential( - nn.Linear(nchannels[2] * 4, feat_dim), - nn.BatchNorm1d(feat_dim), - nn.ReLU(), - ) - self.classifier_local = nn.Linear(feat_dim, num_classes) - self.feat_dim = feat_dim * 2 - else: - self.feat_dim = feat_dim - def init_scale_factors(self): - """Initialize scale factors for STN.""" - self.scale_factors = [] - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float)) - def stn(self, x, theta): - """Spatial transformer network.""" - grid = F.affine_grid(theta, x.size()) - x = F.grid_sample(x, grid) - return x - def transform_theta(self, theta_i, region_idx): - """Transform theta for a given region.""" - scale_factors = self.scale_factors[region_idx] - theta = torch.zeros(theta_i.size(0), 2, 3) - theta[:, :, :2] = scale_factors - theta[:, :, -1] = theta_i - if self.use_gpu: - theta = theta.to(next(self.parameters()).device) - return theta - def forward(self, x): - """Forward pass.""" - assert ( - x.size(2) == 160 and x.size(3) == 64 - ), ( - f"Input size does not match, expected (160, 64) but got ({x.size(2)}, {x.size(3)})" - ) - x = self.conv(x) - x1 = self.inception1(x) - x1_attn, x1_theta = self.ha1(x1) - x1_out = x1 * x1_attn - if self.learn_region: - x1_local_list = [] - for region_idx in range(4): - x1_theta_i = x1_theta[:, region_idx, :] - x1_theta_i = self.transform_theta(x1_theta_i, region_idx) - x1_trans_i = self.stn(x, x1_theta_i) - x1_trans_i = F.interpolate( - x1_trans_i, (24, 28), mode="bilinear", align_corners=True - ) - x1_local_i = self.local_conv1(x1_trans_i) - x1_local_list.append(x1_local_i) - x2 = self.inception2(x1_out) - x2_attn, x2_theta = self.ha2(x2) - x2_out = x2 * x2_attn - if self.learn_region: - x2_local_list = [] - for region_idx in range(4): - x2_theta_i = x2_theta[:, region_idx, :] - x2_theta_i = self.transform_theta(x2_theta_i, region_idx) - x2_trans_i = self.stn(x1_out, x2_theta_i) - x2_trans_i = F.interpolate( - x2_trans_i, (12, 14), mode="bilinear", align_corners=True - ) - x2_local_i = x2_trans_i + x1_local_list[region_idx] - x2_local_i = self.local_conv2(x2_local_i) - x2_local_list.append(x2_local_i) - x3 = self.inception3(x2_out) - x3_attn, x3_theta = self.ha3(x3) - x3_out = x3 * x3_attn - if self.learn_region: - x3_local_list = [] - for region_idx in range(4): - x3_theta_i = x3_theta[:, region_idx, :] - x3_theta_i = self.transform_theta(x3_theta_i, region_idx) - x3_trans_i = self.stn(x2_out, x3_theta_i) - x3_trans_i = F.interpolate( - x3_trans_i, (6, 7), mode="bilinear", align_corners=True - ) - x3_local_i = x3_trans_i + x2_local_list[region_idx] - x3_local_i = self.local_conv3(x3_local_i) - x3_local_list.append(x3_local_i) - x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( - x3_out.size(0), x3_out.size(1) - ) - x_global = self.fc_global(x_global) - if self.learn_region: - x_local_list = [] - for region_idx in range(4): - x_local_i = x3_local_list[region_idx] - x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( - x_local_i.size(0), -1 - ) - x_local_list.append(x_local_i) - x_local = torch.cat(x_local_list, 1) - x_local = self.fc_local(x_local) - if not self.training: - if self.learn_region: - x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) - x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) - return torch.cat([x_global, x_local], 1) - else: - return x_global - prelogits_global = self.classifier_global(x_global) - if self.learn_region: - prelogits_local = self.classifier_local(x_local) - if self.loss == "softmax": - if self.learn_region: - return (prelogits_global, prelogits_local) - else: - return prelogits_global - elif self.loss == "triplet": - if self.learn_region: - return (prelogits_global, prelogits_local), (x_global, x_local) - else: - return prelogits_global, x_global - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + loss="softmax", + nchannels=None, + feat_dim=512, + learn_region=True, + use_gpu=True, + **kwargs, + ): + """Harmonious Attention Convolutional Neural Network (HACNN) for person re-identification.""" + super().__init__() + if nchannels is None: + nchannels = [128, 256, 384] + self.loss = loss + self.learn_region = learn_region + self.use_gpu = use_gpu + self.conv = ConvBlock(3, 32, 3, s=2, p=1) + self.inception1 = nn.Sequential( + InceptionA(32, nchannels[0]), + InceptionB(nchannels[0], nchannels[0]), + ) + self.ha1 = HarmAttn(nchannels[0]) + self.inception2 = nn.Sequential( + InceptionA(nchannels[0], nchannels[1]), + InceptionB(nchannels[1], nchannels[1]), + ) + self.ha2 = HarmAttn(nchannels[1]) + self.inception3 = nn.Sequential( + InceptionA(nchannels[1], nchannels[2]), + InceptionB(nchannels[2], nchannels[2]), + ) + self.ha3 = HarmAttn(nchannels[2]) + self.fc_global = nn.Sequential( + nn.Linear(nchannels[2], feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_global = nn.Linear(feat_dim, num_classes) + if self.learn_region: + self.init_scale_factors() + self.local_conv1 = InceptionB(32, nchannels[0]) + self.local_conv2 = InceptionB(nchannels[0], nchannels[1]) + self.local_conv3 = InceptionB(nchannels[1], nchannels[2]) + self.fc_local = nn.Sequential( + nn.Linear(nchannels[2] * 4, feat_dim), + nn.BatchNorm1d(feat_dim), + nn.ReLU(), + ) + self.classifier_local = nn.Linear(feat_dim, num_classes) + self.feat_dim = feat_dim * 2 + else: + self.feat_dim = feat_dim + + def init_scale_factors(self): + """Initialize scale factors for STN.""" + self.scale_factors = [] + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + self.scale_factors.append( + torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float) + ) + + def stn(self, x, theta): + """Spatial transformer network.""" + grid = F.affine_grid(theta, x.size()) + x = F.grid_sample(x, grid) + return x + + def transform_theta(self, theta_i, region_idx): + """Transform theta for a given region.""" + scale_factors = self.scale_factors[region_idx] + theta = torch.zeros(theta_i.size(0), 2, 3) + theta[:, :, :2] = scale_factors + theta[:, :, -1] = theta_i + if self.use_gpu: + theta = theta.to(next(self.parameters()).device) + return theta + + def forward(self, x): + """Forward pass.""" + assert x.size(2) == 160 and x.size(3) == 64, ( + f"Input size does not match, expected (160, 64) but got ({x.size(2)}, {x.size(3)})" + ) + x = self.conv(x) + x1 = self.inception1(x) + x1_attn, x1_theta = self.ha1(x1) + x1_out = x1 * x1_attn + if self.learn_region: + x1_local_list = [] + for region_idx in range(4): + x1_theta_i = x1_theta[:, region_idx, :] + x1_theta_i = self.transform_theta(x1_theta_i, region_idx) + x1_trans_i = self.stn(x, x1_theta_i) + x1_trans_i = F.interpolate( + x1_trans_i, (24, 28), mode="bilinear", align_corners=True + ) + x1_local_i = self.local_conv1(x1_trans_i) + x1_local_list.append(x1_local_i) + x2 = self.inception2(x1_out) + x2_attn, x2_theta = self.ha2(x2) + x2_out = x2 * x2_attn + if self.learn_region: + x2_local_list = [] + for region_idx in range(4): + x2_theta_i = x2_theta[:, region_idx, :] + x2_theta_i = self.transform_theta(x2_theta_i, region_idx) + x2_trans_i = self.stn(x1_out, x2_theta_i) + x2_trans_i = F.interpolate( + x2_trans_i, (12, 14), mode="bilinear", align_corners=True + ) + x2_local_i = x2_trans_i + x1_local_list[region_idx] + x2_local_i = self.local_conv2(x2_local_i) + x2_local_list.append(x2_local_i) + x3 = self.inception3(x2_out) + x3_attn, x3_theta = self.ha3(x3) + x3_out = x3 * x3_attn + if self.learn_region: + x3_local_list = [] + for region_idx in range(4): + x3_theta_i = x3_theta[:, region_idx, :] + x3_theta_i = self.transform_theta(x3_theta_i, region_idx) + x3_trans_i = self.stn(x2_out, x3_theta_i) + x3_trans_i = F.interpolate( + x3_trans_i, (6, 7), mode="bilinear", align_corners=True + ) + x3_local_i = x3_trans_i + x2_local_list[region_idx] + x3_local_i = self.local_conv3(x3_local_i) + x3_local_list.append(x3_local_i) + x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view( + x3_out.size(0), x3_out.size(1) + ) + x_global = self.fc_global(x_global) + if self.learn_region: + x_local_list = [] + for region_idx in range(4): + x_local_i = x3_local_list[region_idx] + x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view( + x_local_i.size(0), -1 + ) + x_local_list.append(x_local_i) + x_local = torch.cat(x_local_list, 1) + x_local = self.fc_local(x_local) + if not self.training: + if self.learn_region: + x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True) + x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True) + return torch.cat([x_global, x_local], 1) + else: + return x_global + prelogits_global = self.classifier_global(x_global) + if self.learn_region: + prelogits_local = self.classifier_local(x_local) + if self.loss == "softmax": + if self.learn_region: + return (prelogits_global, prelogits_local) + else: + return prelogits_global + elif self.loss == "triplet": + if self.learn_region: + return (prelogits_global, prelogits_local), (x_global, x_local) + else: + return prelogits_global, x_global + else: + raise KeyError(f"Unsupported loss: {self.loss}") diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index 334bd1c8..3d04d003 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -1,7 +1,5 @@ - """MLFN backbone for person re-identification.""" -from __future__ import absolute_import, division import torch import torch.utils.model_zoo as model_zoo from torch import nn @@ -9,8 +7,8 @@ __all__ = ["mlfn"] model_urls = { - # training epoch = 5, top1 = 51.6 - "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", + # training epoch = 5, top1 = 51.6 + "imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk", } diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py index 35a16219..b3e69186 100644 --- a/ethology/reid/backbones/mobilenetv2.py +++ b/ethology/reid/backbones/mobilenetv2.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import torch.utils.model_zoo as model_zoo from torch import nn @@ -9,239 +8,272 @@ __all__ = ["mobilenetv2_x1_0", "mobilenetv2_x1_4"] model_urls = { - # 1.0: top-1 71.3 - "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", - # 1.4: top-1 73.9 - "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", + # 1.0: top-1 71.3 + "mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c", + # 1.4: top-1 73.9 + "mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk", } class ConvBlock(nn.Module): - """Basic convolutional block. + """Basic convolutional block. - convolution (bias discarded) + batch normalization + relu6. + convolution (bias discarded) + batch normalization + relu6. - Args: - in_c (int): number of input channels. - out_c (int): number of output channels. - k (int or tuple): kernel size. - s (int or tuple): stride. - p (int or tuple): padding. - g (int): number of blocked connections from input channels - to output channels (default: 1). - """ + Args: + in_c (int): number of input channels. + out_c (int): number of output channels. + k (int or tuple): kernel size. + s (int or tuple): stride. + p (int or tuple): padding. + g (int): number of blocked connections from input channels + to output channels (default: 1). - def __init__(self, in_c, out_c, k, s=1, p=0, g=1): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) - self.bn = nn.BatchNorm2d(out_c) + """ - def forward(self, x): - return F.relu6(self.bn(self.conv(x))) + def __init__(self, in_c, out_c, k, s=1, p=0, g=1): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + in_c, out_c, k, stride=s, padding=p, bias=False, groups=g + ) + self.bn = nn.BatchNorm2d(out_c) + + def forward(self, x): + return F.relu6(self.bn(self.conv(x))) class Bottleneck(nn.Module): - def __init__(self, in_channels, out_channels, expansion_factor, stride=1): - super(Bottleneck, self).__init__() - mid_channels = in_channels * expansion_factor - self.use_residual = stride == 1 and in_channels == out_channels - self.conv1 = ConvBlock(in_channels, mid_channels, 1) - self.dwconv2 = ConvBlock( - mid_channels, mid_channels, 3, stride, 1, g=mid_channels - ) - self.conv3 = nn.Sequential( - nn.Conv2d(mid_channels, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - m = self.conv1(x) - m = self.dwconv2(m) - m = self.conv3(m) - if self.use_residual: - return x + m - else: - return m + def __init__(self, in_channels, out_channels, expansion_factor, stride=1): + super(Bottleneck, self).__init__() + mid_channels = in_channels * expansion_factor + self.use_residual = stride == 1 and in_channels == out_channels + self.conv1 = ConvBlock(in_channels, mid_channels, 1) + self.dwconv2 = ConvBlock( + mid_channels, mid_channels, 3, stride, 1, g=mid_channels + ) + self.conv3 = nn.Sequential( + nn.Conv2d(mid_channels, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + m = self.conv1(x) + m = self.dwconv2(m) + m = self.conv3(m) + if self.use_residual: + return x + m + else: + return m class MobileNetV2(nn.Module): - """MobileNetV2. - - Reference: - Sandler et al. MobileNetV2: Inverted Residuals and - Linear Bottlenecks. CVPR 2018. - - Public keys: - - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. - - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. - """ - - def __init__( - self, - num_classes, - width_mult=1, - loss="softmax", - fc_dims=None, - dropout_p=None, - **kwargs, - ): - super(MobileNetV2, self).__init__() - self.loss = loss - self.in_channels = int(32 * width_mult) - self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 - - # construct layers - self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) - self.conv2 = self._make_layer(Bottleneck, 1, int(16 * width_mult), 1, 1) - self.conv3 = self._make_layer(Bottleneck, 6, int(24 * width_mult), 2, 2) - self.conv4 = self._make_layer(Bottleneck, 6, int(32 * width_mult), 3, 2) - self.conv5 = self._make_layer(Bottleneck, 6, int(64 * width_mult), 4, 2) - self.conv6 = self._make_layer(Bottleneck, 6, int(96 * width_mult), 3, 1) - self.conv7 = self._make_layer(Bottleneck, 6, int(160 * width_mult), 3, 2) - self.conv8 = self._make_layer(Bottleneck, 6, int(320 * width_mult), 1, 1) - self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) - - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(fc_dims, self.feature_dim, dropout_p) - self.classifier = nn.Linear(self.feature_dim, num_classes) - - self._init_params() - - def _make_layer(self, block, t, c, n, s): - # t: expansion factor - # c: output channels - # n: number of blocks - # s: stride for first layer - layers = [] - layers.append(block(self.in_channels, c, t, s)) - self.in_channels = c - for i in range(1, n): - layers.append(block(self.in_channels, c, t)) - return nn.Sequential(*layers) - - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - """Constructs fully connected layer. - - Args: - fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed - input_dim (int): input dimension - dropout_p (float): dropout probability, if None, dropout is unused - """ - if fc_dims is None: - self.feature_dim = input_dim - return None - - assert isinstance( - fc_dims, (list, tuple) - ), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims)) - - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - - self.feature_dim = fc_dims[-1] - - return nn.Sequential(*layers) - - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def featuremaps(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - x = self.conv6(x) - x = self.conv7(x) - x = self.conv8(x) - x = self.conv9(x) - return x - - def forward(self, x): - f = self.featuremaps(x) - v = self.global_avgpool(f) - v = v.view(v.size(0), -1) - - if self.fc is not None: - v = self.fc(v) - - if not self.training: - return v - - y = self.classifier(v) - - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + """MobileNetV2. + + Reference: + Sandler et al. MobileNetV2: Inverted Residuals and + Linear Bottlenecks. CVPR 2018. + + Public keys: + - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. + - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. + """ + + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super(MobileNetV2, self).__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + + # construct layers + self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) + self.conv2 = self._make_layer( + Bottleneck, 1, int(16 * width_mult), 1, 1 + ) + self.conv3 = self._make_layer( + Bottleneck, 6, int(24 * width_mult), 2, 2 + ) + self.conv4 = self._make_layer( + Bottleneck, 6, int(32 * width_mult), 3, 2 + ) + self.conv5 = self._make_layer( + Bottleneck, 6, int(64 * width_mult), 4, 2 + ) + self.conv6 = self._make_layer( + Bottleneck, 6, int(96 * width_mult), 3, 1 + ) + self.conv7 = self._make_layer( + Bottleneck, 6, int(160 * width_mult), 3, 2 + ) + self.conv8 = self._make_layer( + Bottleneck, 6, int(320 * width_mult), 1, 1 + ) + self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1) + + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + fc_dims, self.feature_dim, dropout_p + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + + self._init_params() + + def _make_layer(self, block, t, c, n, s): + # t: expansion factor + # c: output channels + # n: number of blocks + # s: stride for first layer + layers = [] + layers.append(block(self.in_channels, c, t, s)) + self.in_channels = c + for i in range(1, n): + layers.append(block(self.in_channels, c, t)) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + """Constructs fully connected layer. + + Args: + fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed + input_dim (int): input dimension + dropout_p (float): dropout probability, if None, dropout is unused + + """ + if fc_dims is None: + self.feature_dim = input_dim + return None + + assert isinstance(fc_dims, (list, tuple)), ( + f"fc_dims must be either list or tuple, but got {type(fc_dims)}" + ) + + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + + self.feature_dim = fc_dims[-1] + + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d) or isinstance( + m, nn.BatchNorm1d + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + x = self.conv6(x) + x = self.conv7(x) + x = self.conv8(x) + x = self.conv9(x) + return x + + def forward(self, x): + f = self.featuremaps(x) + v = self.global_avgpool(f) + v = v.view(v.size(0), -1) + + if self.fc is not None: + v = self.fc(v) + + if not self.training: + return v + + y = self.classifier(v) + + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") def init_pretrained_weights(model, model_url): - """Initializes model with pretrained weights. + """Initializes model with pretrained weights. - Layers that don't match with pretrained layers in name or size are kept unchanged. - """ - pretrain_dict = model_zoo.load_url(model_url) - model_dict = model.state_dict() - pretrain_dict = { - k: v - for k, v in pretrain_dict.items() - if k in model_dict and model_dict[k].size() == v.size() - } - model_dict.update(pretrain_dict) - model.load_state_dict(model_dict) + Layers that don't match with pretrained layers in name or size are kept unchanged. + """ + pretrain_dict = model_zoo.load_url(model_url) + model_dict = model.state_dict() + pretrain_dict = { + k: v + for k, v in pretrain_dict.items() + if k in model_dict and model_dict[k].size() == v.size() + } + model_dict.update(pretrain_dict) + model.load_state_dict(model_dict) def mobilenetv2_x1_0(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, loss=loss, width_mult=1, fc_dims=None, dropout_p=None, **kwargs - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_0"] - ) - ) - return model + model = MobileNetV2( + num_classes, + loss=loss, + width_mult=1, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_0']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_0"] + ) + ) + return model def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs): - model = MobileNetV2( - num_classes, loss=loss, width_mult=1.4, fc_dims=None, dropout_p=None, **kwargs - ) - if pretrained: - # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) - import warnings - - warnings.warn( - "The imagenet pretrained weights need to be manually downloaded from {}".format( - model_urls["mobilenetv2_x1_4"] - ) - ) - return model + model = MobileNetV2( + num_classes, + loss=loss, + width_mult=1.4, + fc_dims=None, + dropout_p=None, + **kwargs, + ) + if pretrained: + # init_pretrained_weights(model, model_urls['mobilenetv2_x1_4']) + import warnings + + warnings.warn( + "The imagenet pretrained weights need to be manually downloaded from {}".format( + model_urls["mobilenetv2_x1_4"] + ) + ) + return model + + # Copied from boxmot/boxmot/reid/backbones/mobilenetv2.py diff --git a/ethology/reid/backbones/osnet.py b/ethology/reid/backbones/osnet.py index c07e4e45..c13dd5b7 100644 --- a/ethology/reid/backbones/osnet.py +++ b/ethology/reid/backbones/osnet.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import warnings @@ -8,338 +7,529 @@ from torch import nn from torch.nn import functional as F -__all__ = ["osnet_x1_0", "osnet_x0_75", "osnet_x0_5", "osnet_x0_25", "osnet_ibn_x1_0"] +__all__ = [ + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", +] pretrained_urls = { - "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", - "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", - "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", - "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", - "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", + "osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY", + "osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq", + "osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i", + "osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs", + "osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, ChannelGate, OSBlock... + class ConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + groups=1, + IN=False, + ): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + groups=groups, + ) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 1, + stride=stride, + padding=0, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) - self.bn = nn.BatchNorm2d(out_channels) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return x + def __init__(self, in_channels, out_channels, stride=1): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) - self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - x = self.relu(x) - return x + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, 1, stride=1, padding=0, bias=False + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=False, + groups=out_channels, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + x = self.relu(x) + return x + class ChannelGate(nn.Module): - def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU(inplace=True) - self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU(inplace=True) - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x + def __init__( + self, + in_channels, + num_gates=None, + return_gates=False, + gate_activation="sigmoid", + reduction=16, + layer_norm=False, + ): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + in_channels, + in_channels // reduction, + kernel_size=1, + bias=True, + padding=0, + ) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d( + in_channels // reduction, + num_gates, + kernel_size=1, + bias=True, + padding=0, + ) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU(inplace=True) + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError(f"Unknown gate activation: {gate_activation}") + + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + class OSBlock(nn.Module): - def __init__(self, in_channels, out_channels, IN=False, bottleneck_reduction=4, **kwargs): - super(OSBlock, self).__init__() - mid_channels = out_channels // bottleneck_reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2a = LightConv3x3(mid_channels, mid_channels) - self.conv2b = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2c = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.conv2d = nn.Sequential( - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - LightConv3x3(mid_channels, mid_channels), - ) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = None - if IN: - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2a = self.conv2a(x1) - x2b = self.conv2b(x1) - x2c = self.conv2c(x1) - x2d = self.conv2d(x1) - x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - if self.IN is not None: - out = self.IN(out) - return F.relu(out) + def __init__( + self, + in_channels, + out_channels, + IN=False, + bottleneck_reduction=4, + **kwargs, + ): + super(OSBlock, self).__init__() + mid_channels = out_channels // bottleneck_reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2a = LightConv3x3(mid_channels, mid_channels) + self.conv2b = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2c = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.conv2d = nn.Sequential( + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + LightConv3x3(mid_channels, mid_channels), + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = None + if IN: + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2a = self.conv2a(x1) + x2b = self.conv2b(x1) + x2c = self.conv2c(x1) + x2d = self.conv2d(x1) + x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + if self.IN is not None: + out = self.IN(out) + return F.relu(out) + class OSNet(nn.Module): - def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", IN=False, **kwargs): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1], reduce_spatial_size=True, IN=IN) - self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2], reduce_spatial_size=True) - self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3], reduce_spatial_size=False) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - def _make_layer(self, block, layer, in_channels, out_channels, reduce_spatial_size, IN=False): - layers = [] - layers.append(block(in_channels, out_channels, IN=IN)) - for i in range(1, layer): - layers.append(block(out_channels, out_channels, IN=IN)) - if reduce_spatial_size: - layers.append(nn.Sequential(Conv1x1(out_channels, out_channels), nn.AvgPool2d(2, stride=2))) - return nn.Sequential(*layers) - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU(inplace=True)) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + blocks, + layers, + channels, + feature_dim=512, + loss="softmax", + IN=False, + **kwargs, + ): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer( + blocks[0], + layers[0], + channels[0], + channels[1], + reduce_spatial_size=True, + IN=IN, + ) + self.conv3 = self._make_layer( + blocks[1], + layers[1], + channels[1], + channels[2], + reduce_spatial_size=True, + ) + self.conv4 = self._make_layer( + blocks[2], + layers[2], + channels[2], + channels[3], + reduce_spatial_size=False, + ) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + self.feature_dim, channels[3], dropout_p=None + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + + def _make_layer( + self, + block, + layer, + in_channels, + out_channels, + reduce_spatial_size, + IN=False, + ): + layers = [] + layers.append(block(in_channels, out_channels, IN=IN)) + for i in range(1, layer): + layers.append(block(out_channels, out_channels, IN=IN)) + if reduce_spatial_size: + layers.append( + nn.Sequential( + Conv1x1(out_channels, out_channels), + nn.AvgPool2d(2, stride=2), + ) + ) + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU(inplace=True)) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d) or isinstance( + m, nn.BatchNorm1d + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") + def init_pretrained_weights(model, key=""): - import errno - import os - from collections import OrderedDict - import gdown - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), - ) - ) - return torch_home - filename = key + "_imagenet.pth" - # Try ethology/models/ directory first - ethology_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) - models_dir = os.path.join(ethology_root, "models") - os.makedirs(models_dir, exist_ok=True) - local_file = os.path.join(models_dir, filename) - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - os.makedirs(model_dir, exist_ok=True) - cached_file = os.path.join(model_dir, filename) - # Prefer ethology/models/ directory file if present - if os.path.exists(local_file): - print(f"[OSNet] Loading model weights from {local_file}") - cached_file = local_file - elif os.path.exists(cached_file): - print(f"[OSNet] Loading model weights from {cached_file}") - else: - print(f"[OSNet] Downloading model weights to {cached_file}") - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights from "{}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)".format(cached_file) - ) - else: - print( - 'Successfully loaded imagenet pretrained weights from "{}"'.format( - cached_file - ) - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - "due to unmatched keys or layer size: {}".format(discarded_layers) - ) + import os + from collections import OrderedDict + + import gdown + + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" + ), + ) + ) + return torch_home + + filename = key + "_imagenet.pth" + # Try ethology/models/ directory first + ethology_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../../") + ) + models_dir = os.path.join(ethology_root, "models") + os.makedirs(models_dir, exist_ok=True) + local_file = os.path.join(models_dir, filename) + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + os.makedirs(model_dir, exist_ok=True) + cached_file = os.path.join(model_dir, filename) + # Prefer ethology/models/ directory file if present + if os.path.exists(local_file): + print(f"[OSNet] Loading model weights from {local_file}") + cached_file = local_file + elif os.path.exists(cached_file): + print(f"[OSNet] Loading model weights from {cached_file}") + else: + print(f"[OSNet] Downloading model weights to {cached_file}") + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + f'The pretrained weights from "{cached_file}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)" + ) + else: + print( + f'Successfully loaded imagenet pretrained weights from "{cached_file}"' + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + f"due to unmatched keys or layer size: {discarded_layers}" + ) + def osnet_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x1_0") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x1_0") + return model + def osnet_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_75") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[48, 192, 288, 384], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_75") + return model + def osnet_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_5") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[32, 128, 192, 256], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_5") + return model + def osnet_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_x0_25") - return model - -def osnet_ibn_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[OSBlock, OSBlock, OSBlock], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ibn_x1_0") - return model + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[16, 64, 96, 128], + loss=loss, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_x0_25") + return model + + +def osnet_ibn_x1_0( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[OSBlock, OSBlock, OSBlock], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ibn_x1_0") + return model diff --git a/ethology/reid/backbones/osnet_ain.py b/ethology/reid/backbones/osnet_ain.py index 9e052209..2ef3da25 100644 --- a/ethology/reid/backbones/osnet_ain.py +++ b/ethology/reid/backbones/osnet_ain.py @@ -1,6 +1,5 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -from __future__ import absolute_import, division import warnings @@ -8,349 +7,541 @@ from torch import nn from torch.nn import functional as F -__all__ = ["osnet_ain_x1_0", "osnet_ain_x0_75", "osnet_ain_x0_5", "osnet_ain_x0_25"] +__all__ = [ + "osnet_ain_x1_0", + "osnet_ain_x0_75", + "osnet_ain_x0_5", + "osnet_ain_x0_25", +] pretrained_urls = { - "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", - "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", - "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", - "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", + "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo", + "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM", + "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l", + "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt", } # ...existing code for ConvLayer, Conv1x1, Conv1x1Linear, Conv3x3, LightConv3x3, LightConvStream, ChannelGate, OSBlock, OSBlockINin, OSNet, init_pretrained_weights, and instantiation functions... + class ConvLayer(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1, IN=False): - super(ConvLayer, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, groups=groups) - if IN: - self.bn = nn.InstanceNorm2d(out_channels, affine=True) - else: - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + groups=1, + IN=False, + ): + super(ConvLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + groups=groups, + ) + if IN: + self.bn = nn.InstanceNorm2d(out_channels, affine=True) + else: + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class Conv1x1(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv1x1, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv1x1, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 1, + stride=stride, + padding=0, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class Conv1x1Linear(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, bn=True): - super(Conv1x1Linear, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=stride, padding=0, bias=False) - self.bn = None - if bn: - self.bn = nn.BatchNorm2d(out_channels) - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - return x + def __init__(self, in_channels, out_channels, stride=1, bn=True): + super(Conv1x1Linear, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=stride, padding=0, bias=False + ) + self.bn = None + if bn: + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + return x + class Conv3x3(nn.Module): - def __init__(self, in_channels, out_channels, stride=1, groups=1): - super(Conv3x3, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False, groups=groups) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels, stride=1, groups=1): + super(Conv3x3, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + 3, + stride=stride, + padding=1, + bias=False, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return self.relu(x) + class LightConv3x3(nn.Module): - def __init__(self, in_channels, out_channels): - super(LightConv3x3, self).__init__() - self.conv1 = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False) - self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False, groups=out_channels) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU() - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.bn(x) - return self.relu(x) + def __init__(self, in_channels, out_channels): + super(LightConv3x3, self).__init__() + self.conv1 = nn.Conv2d( + in_channels, out_channels, 1, stride=1, padding=0, bias=False + ) + self.conv2 = nn.Conv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=False, + groups=out_channels, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.bn(x) + return self.relu(x) + class LightConvStream(nn.Module): - def __init__(self, in_channels, out_channels, depth): - super(LightConvStream, self).__init__() - assert depth >= 1 - layers = [LightConv3x3(in_channels, out_channels)] - for i in range(depth - 1): - layers.append(LightConv3x3(out_channels, out_channels)) - self.layers = nn.Sequential(*layers) - def forward(self, x): - return self.layers(x) + def __init__(self, in_channels, out_channels, depth): + super(LightConvStream, self).__init__() + assert depth >= 1 + layers = [LightConv3x3(in_channels, out_channels)] + for i in range(depth - 1): + layers.append(LightConv3x3(out_channels, out_channels)) + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + class ChannelGate(nn.Module): - def __init__(self, in_channels, num_gates=None, return_gates=False, gate_activation="sigmoid", reduction=16, layer_norm=False): - super(ChannelGate, self).__init__() - if num_gates is None: - num_gates = in_channels - self.return_gates = return_gates - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d(in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0) - self.norm1 = None - if layer_norm: - self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) - self.relu = nn.ReLU() - self.fc2 = nn.Conv2d(in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0) - if gate_activation == "sigmoid": - self.gate_activation = nn.Sigmoid() - elif gate_activation == "relu": - self.gate_activation = nn.ReLU() - elif gate_activation == "linear": - self.gate_activation = None - else: - raise RuntimeError("Unknown gate activation: {}".format(gate_activation)) - def forward(self, x): - input = x - x = self.global_avgpool(x) - x = self.fc1(x) - if self.norm1 is not None: - x = self.norm1(x) - x = self.relu(x) - x = self.fc2(x) - if self.gate_activation is not None: - x = self.gate_activation(x) - if self.return_gates: - return x - return input * x + def __init__( + self, + in_channels, + num_gates=None, + return_gates=False, + gate_activation="sigmoid", + reduction=16, + layer_norm=False, + ): + super(ChannelGate, self).__init__() + if num_gates is None: + num_gates = in_channels + self.return_gates = return_gates + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Conv2d( + in_channels, + in_channels // reduction, + kernel_size=1, + bias=True, + padding=0, + ) + self.norm1 = None + if layer_norm: + self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1)) + self.relu = nn.ReLU() + self.fc2 = nn.Conv2d( + in_channels // reduction, + num_gates, + kernel_size=1, + bias=True, + padding=0, + ) + if gate_activation == "sigmoid": + self.gate_activation = nn.Sigmoid() + elif gate_activation == "relu": + self.gate_activation = nn.ReLU() + elif gate_activation == "linear": + self.gate_activation = None + else: + raise RuntimeError(f"Unknown gate activation: {gate_activation}") + + def forward(self, x): + input = x + x = self.global_avgpool(x) + x = self.fc1(x) + if self.norm1 is not None: + x = self.norm1(x) + x = self.relu(x) + x = self.fc2(x) + if self.gate_activation is not None: + x = self.gate_activation(x) + if self.return_gates: + return x + return input * x + class OSBlock(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlock, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlock, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList( + [ + LightConvStream(mid_channels, mid_channels, t) + for t in range(1, T + 1) + ] + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + class OSBlockINin(nn.Module): - def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): - super(OSBlockINin, self).__init__() - assert T >= 1 - assert out_channels >= reduction and out_channels % reduction == 0 - mid_channels = out_channels // reduction - self.conv1 = Conv1x1(in_channels, mid_channels) - self.conv2 = nn.ModuleList([LightConvStream(mid_channels, mid_channels, t) for t in range(1, T + 1)]) - self.gate = ChannelGate(mid_channels) - self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) - self.downsample = None - if in_channels != out_channels: - self.downsample = Conv1x1Linear(in_channels, out_channels) - self.IN = nn.InstanceNorm2d(out_channels, affine=True) - def forward(self, x): - identity = x - x1 = self.conv1(x) - x2 = 0 - for conv2_t in self.conv2: - x2_t = conv2_t(x1) - x2 = x2 + self.gate(x2_t) - x3 = self.conv3(x2) - x3 = self.IN(x3) - if self.downsample is not None: - identity = self.downsample(identity) - out = x3 + identity - return F.relu(out) + def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs): + super(OSBlockINin, self).__init__() + assert T >= 1 + assert out_channels >= reduction and out_channels % reduction == 0 + mid_channels = out_channels // reduction + self.conv1 = Conv1x1(in_channels, mid_channels) + self.conv2 = nn.ModuleList( + [ + LightConvStream(mid_channels, mid_channels, t) + for t in range(1, T + 1) + ] + ) + self.gate = ChannelGate(mid_channels) + self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False) + self.downsample = None + if in_channels != out_channels: + self.downsample = Conv1x1Linear(in_channels, out_channels) + self.IN = nn.InstanceNorm2d(out_channels, affine=True) + + def forward(self, x): + identity = x + x1 = self.conv1(x) + x2 = 0 + for conv2_t in self.conv2: + x2_t = conv2_t(x1) + x2 = x2 + self.gate(x2_t) + x3 = self.conv3(x2) + x3 = self.IN(x3) + if self.downsample is not None: + identity = self.downsample(identity) + out = x3 + identity + return F.relu(out) + class OSNet(nn.Module): - def __init__(self, num_classes, blocks, layers, channels, feature_dim=512, loss="softmax", conv1_IN=False, **kwargs): - super(OSNet, self).__init__() - num_blocks = len(blocks) - assert num_blocks == len(layers) - assert num_blocks == len(channels) - 1 - self.loss = loss - self.feature_dim = feature_dim - self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=conv1_IN) - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1]) - self.pool2 = nn.Sequential(Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2)) - self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2]) - self.pool3 = nn.Sequential(Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2)) - self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3]) - self.conv5 = Conv1x1(channels[3], channels[3]) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = self._construct_fc_layer(self.feature_dim, channels[3], dropout_p=None) - self.classifier = nn.Linear(self.feature_dim, num_classes) - self._init_params() - def _make_layer(self, blocks, layer, in_channels, out_channels): - layers = [] - layers += [blocks[0](in_channels, out_channels)] - for i in range(1, len(blocks)): - layers += [blocks[i](out_channels, out_channels)] - return nn.Sequential(*layers) - def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): - if fc_dims is None or fc_dims < 0: - self.feature_dim = input_dim - return None - if isinstance(fc_dims, int): - fc_dims = [fc_dims] - layers = [] - for dim in fc_dims: - layers.append(nn.Linear(input_dim, dim)) - layers.append(nn.BatchNorm1d(dim)) - layers.append(nn.ReLU()) - if dropout_p is not None: - layers.append(nn.Dropout(p=dropout_p)) - input_dim = dim - self.feature_dim = fc_dims[-1] - return nn.Sequential(*layers) - def _init_params(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm1d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.InstanceNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - def featuremaps(self, x): - x = self.conv1(x) - x = self.maxpool(x) - x = self.conv2(x) - x = self.pool2(x) - x = self.conv3(x) - x = self.pool3(x) - x = self.conv4(x) - x = self.conv5(x) - return x - def forward(self, x, return_featuremaps=False): - x = self.featuremaps(x) - if return_featuremaps: - return x - v = self.global_avgpool(x) - v = v.view(v.size(0), -1) - if self.fc is not None: - v = self.fc(v) - if not self.training: - return v - y = self.classifier(v) - if self.loss == "softmax": - return y - elif self.loss == "triplet": - return y, v - else: - raise KeyError("Unsupported loss: {}".format(self.loss)) + def __init__( + self, + num_classes, + blocks, + layers, + channels, + feature_dim=512, + loss="softmax", + conv1_IN=False, + **kwargs, + ): + super(OSNet, self).__init__() + num_blocks = len(blocks) + assert num_blocks == len(layers) + assert num_blocks == len(channels) - 1 + self.loss = loss + self.feature_dim = feature_dim + self.conv1 = ConvLayer( + 3, channels[0], 7, stride=2, padding=3, IN=conv1_IN + ) + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = self._make_layer( + blocks[0], layers[0], channels[0], channels[1] + ) + self.pool2 = nn.Sequential( + Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2) + ) + self.conv3 = self._make_layer( + blocks[1], layers[1], channels[1], channels[2] + ) + self.pool3 = nn.Sequential( + Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2) + ) + self.conv4 = self._make_layer( + blocks[2], layers[2], channels[2], channels[3] + ) + self.conv5 = Conv1x1(channels[3], channels[3]) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = self._construct_fc_layer( + self.feature_dim, channels[3], dropout_p=None + ) + self.classifier = nn.Linear(self.feature_dim, num_classes) + self._init_params() + + def _make_layer(self, blocks, layer, in_channels, out_channels): + layers = [] + layers += [blocks[0](in_channels, out_channels)] + for i in range(1, len(blocks)): + layers += [blocks[i](out_channels, out_channels)] + return nn.Sequential(*layers) + + def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): + if fc_dims is None or fc_dims < 0: + self.feature_dim = input_dim + return None + if isinstance(fc_dims, int): + fc_dims = [fc_dims] + layers = [] + for dim in fc_dims: + layers.append(nn.Linear(input_dim, dim)) + layers.append(nn.BatchNorm1d(dim)) + layers.append(nn.ReLU()) + if dropout_p is not None: + layers.append(nn.Dropout(p=dropout_p)) + input_dim = dim + self.feature_dim = fc_dims[-1] + return nn.Sequential(*layers) + + def _init_params(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif ( + isinstance(m, nn.BatchNorm2d) + or isinstance(m, nn.BatchNorm1d) + or isinstance(m, nn.InstanceNorm2d) + ): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def featuremaps(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.conv2(x) + x = self.pool2(x) + x = self.conv3(x) + x = self.pool3(x) + x = self.conv4(x) + x = self.conv5(x) + return x + + def forward(self, x, return_featuremaps=False): + x = self.featuremaps(x) + if return_featuremaps: + return x + v = self.global_avgpool(x) + v = v.view(v.size(0), -1) + if self.fc is not None: + v = self.fc(v) + if not self.training: + return v + y = self.classifier(v) + if self.loss == "softmax": + return y + elif self.loss == "triplet": + return y, v + else: + raise KeyError(f"Unsupported loss: {self.loss}") + def init_pretrained_weights(model, key=""): - import errno - import os - from collections import OrderedDict - import gdown - def _get_torch_home(): - ENV_TORCH_HOME = "TORCH_HOME" - ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" - DEFAULT_CACHE_DIR = "~/.cache" - torch_home = os.path.expanduser( - os.getenv( - ENV_TORCH_HOME, - os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"), - ) - ) - return torch_home - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, "checkpoints") - try: - os.makedirs(model_dir) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - filename = key + "_imagenet.pth" - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - gdown.download(pretrained_urls[key], cached_file, quiet=False) - state_dict = torch.load(cached_file) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - if k.startswith("module."): - k = k[7:] - if k in model_dict and model_dict[k].size() == v.size(): - new_state_dict[k] = v - matched_layers.append(k) - else: - discarded_layers.append(k) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) - if len(matched_layers) == 0: - warnings.warn( - 'The pretrained weights from "{}" cannot be loaded, ' - "please check the key names manually " - "(** ignored and continue **)".format(cached_file) - ) - else: - print( - 'Successfully loaded imagenet pretrained weights from "{}"'.format( - cached_file - ) - ) - if len(discarded_layers) > 0: - print( - "** The following layers are discarded " - "due to unmatched keys or layer size: {}".format(discarded_layers) - ) - -def osnet_ain_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[64, 256, 384, 512], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x1_0") - return model - -def osnet_ain_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[48, 192, 288, 384], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_75") - return model - -def osnet_ain_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[32, 128, 192, 256], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_5") - return model - -def osnet_ain_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs): - model = OSNet(num_classes, blocks=[[OSBlockINin, OSBlockINin], [OSBlock, OSBlockINin], [OSBlockINin, OSBlock]], layers=[2, 2, 2], channels=[16, 64, 96, 128], loss=loss, conv1_IN=True, **kwargs) - if pretrained: - init_pretrained_weights(model, key="osnet_ain_x0_25") - return model + import errno + import os + from collections import OrderedDict + + import gdown + + def _get_torch_home(): + ENV_TORCH_HOME = "TORCH_HOME" + ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" + DEFAULT_CACHE_DIR = "~/.cache" + torch_home = os.path.expanduser( + os.getenv( + ENV_TORCH_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch" + ), + ) + ) + return torch_home + + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, "checkpoints") + try: + os.makedirs(model_dir) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + filename = key + "_imagenet.pth" + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + gdown.download(pretrained_urls[key], cached_file, quiet=False) + state_dict = torch.load(cached_file) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + if k.startswith("module."): + k = k[7:] + if k in model_dict and model_dict[k].size() == v.size(): + new_state_dict[k] = v + matched_layers.append(k) + else: + discarded_layers.append(k) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) + if len(matched_layers) == 0: + warnings.warn( + f'The pretrained weights from "{cached_file}" cannot be loaded, ' + "please check the key names manually " + "(** ignored and continue **)" + ) + else: + print( + f'Successfully loaded imagenet pretrained weights from "{cached_file}"' + ) + if len(discarded_layers) > 0: + print( + "** The following layers are discarded " + f"due to unmatched keys or layer size: {discarded_layers}" + ) + + +def osnet_ain_x1_0( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[64, 256, 384, 512], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x1_0") + return model + + +def osnet_ain_x0_75( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[48, 192, 288, 384], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_75") + return model + + +def osnet_ain_x0_5( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[32, 128, 192, 256], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_5") + return model + + +def osnet_ain_x0_25( + num_classes=1000, pretrained=True, loss="softmax", **kwargs +): + model = OSNet( + num_classes, + blocks=[ + [OSBlockINin, OSBlockINin], + [OSBlock, OSBlockINin], + [OSBlockINin, OSBlock], + ], + layers=[2, 2, 2], + channels=[16, 64, 96, 128], + loss=loss, + conv1_IN=True, + **kwargs, + ) + if pretrained: + init_pretrained_weights(model, key="osnet_ain_x0_25") + return model diff --git a/ethology/reid/backends/base_backend.py b/ethology/reid/backends/base_backend.py index 688ec43a..0edb2826 100644 --- a/ethology/reid/backends/base_backend.py +++ b/ethology/reid/backends/base_backend.py @@ -1,18 +1,19 @@ - -import os from abc import abstractmethod from pathlib import Path + import cv2 import gdown import numpy as np import torch from filelock import SoftFileLock + from ethology.reid.core.registry import ReIDModelRegistry + # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.checks import RequirementsChecker # If needed, implement or set RequirementsChecker -class BaseModelBackend: +class BaseModelBackend: def __init__(self, weights, device, half): self.weights = weights[0] if isinstance(weights, list) else weights if isinstance(self.weights, str): @@ -22,7 +23,7 @@ def __init__(self, weights, device, half): self.half = half self.model = None # Support both string and torch.device for device - if hasattr(self.device, 'type'): + if hasattr(self.device, "type"): self.cuda = torch.cuda.is_available() and self.device.type != "cpu" else: self.cuda = torch.cuda.is_available() and self.device != "cpu" @@ -41,11 +42,19 @@ def __init__(self, weights, device, half): self.load_model(self.weights) - self.mean_array = torch.tensor([0.485, 0.456, 0.406], device=self.device).view(1, 3, 1, 1) - self.std_array = torch.tensor([0.229, 0.224, 0.225], device=self.device).view(1, 3, 1, 1) + self.mean_array = torch.tensor( + [0.485, 0.456, 0.406], device=self.device + ).view(1, 3, 1, 1) + self.std_array = torch.tensor( + [0.229, 0.224, 0.225], device=self.device + ).view(1, 3, 1, 1) if "clip" in self.model_name: - self.mean_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) - self.std_array = torch.tensor([0.5, 0.5, 0.5], device=self.device).view(1, 3, 1, 1) + self.mean_array = torch.tensor( + [0.5, 0.5, 0.5], device=self.device + ).view(1, 3, 1, 1) + self.std_array = torch.tensor( + [0.5, 0.5, 0.5], device=self.device + ).view(1, 3, 1, 1) if "vehicleid" in self.weights.name or "veri" in self.weights.name: input_shape = (256, 256) @@ -57,7 +66,6 @@ def __init__(self, weights, device, half): input_shape = (256, 128) self.input_shape = input_shape - def get_crops(self, xyxys, img): h, w = img.shape[:2] interpolation_method = cv2.INTER_LINEAR @@ -85,7 +93,6 @@ def get_crops(self, xyxys, img): crops = (crops - self.mean_array) / self.std_array return crops - @torch.no_grad() def get_features(self, xyxys, img): if xyxys.size != 0: @@ -98,7 +105,6 @@ def get_features(self, xyxys, img): features = features / np.linalg.norm(features, axis=-1, keepdims=True) return features - def warmup(self, imgsz=[(256, 128, 3)]): if self.device.type != "cpu": im = np.random.randint(0, 255, *imgsz, dtype=np.uint8) @@ -108,11 +114,9 @@ def warmup(self, imgsz=[(256, 128, 3)]): crops = self.inference_preprocess(crops) self.forward(crops) - def to_numpy(self, x): return x.cpu().numpy() if isinstance(x, torch.Tensor) else x - def inference_preprocess(self, x): if self.half: if isinstance(x, torch.Tensor): @@ -121,32 +125,34 @@ def inference_preprocess(self, x): elif isinstance(x, np.ndarray): if x.dtype != np.float16: x = x.astype(np.float16) - if hasattr(self, 'nhwc') and self.nhwc: + if hasattr(self, "nhwc") and self.nhwc: if isinstance(x, torch.Tensor): x = x.permute(0, 2, 3, 1) elif isinstance(x, np.ndarray): x = np.transpose(x, (0, 2, 3, 1)) return x - def inference_postprocess(self, features): if isinstance(features, (list, tuple)): return ( - self.to_numpy(features[0]) if len(features) == 1 else [self.to_numpy(x) for x in features] + self.to_numpy(features[0]) + if len(features) == 1 + else [self.to_numpy(x) for x in features] ) else: return self.to_numpy(features) - @abstractmethod def forward(self, im_batch): - raise NotImplementedError("This method should be implemented by subclasses.") - + raise NotImplementedError( + "This method should be implemented by subclasses." + ) @abstractmethod def load_model(self, w): - raise NotImplementedError("This method should be implemented by subclasses.") - + raise NotImplementedError( + "This method should be implemented by subclasses." + ) def download_model(self, w): if isinstance(w, str): diff --git a/ethology/reid/backends/onnx_backend.py b/ethology/reid/backends/onnx_backend.py index c7c93017..41aefb7f 100644 --- a/ethology/reid/backends/onnx_backend.py +++ b/ethology/reid/backends/onnx_backend.py @@ -1,31 +1,34 @@ - from ethology.reid.backends.base_backend import BaseModelBackend + class ONNXBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # ONNXRuntime will attempt to use the first provider, and if it fails or is not + # available for some reason, it will fall back to the next provider in the list + if self.device.type == "mps": + # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) + providers = ["MPSExecutionProvider", "CPUExecutionProvider"] + elif self.device.type == "cuda": + # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + else: + # self.checker.check_packages(("onnxruntime==1.18.1",)) + providers = ["CPUExecutionProvider"] + import onnxruntime - def load_model(self, w): - # ONNXRuntime will attempt to use the first provider, and if it fails or is not - # available for some reason, it will fall back to the next provider in the list - if self.device.type == "mps": - # self.checker.check_packages(("onnxruntime-silicon==1.18.1",)) - providers = ["MPSExecutionProvider", "CPUExecutionProvider"] - elif self.device.type == "cuda": - # self.checker.check_packages(("onnxruntime-gpu==1.18.1",)) - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - else: - # self.checker.check_packages(("onnxruntime==1.18.1",)) - providers = ["CPUExecutionProvider"] - import onnxruntime - self.session = onnxruntime.InferenceSession(str(w), providers=providers) + self.session = onnxruntime.InferenceSession( + str(w), providers=providers + ) - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() - features = self.session.run( - [self.session.get_outputs()[0].name], - {self.session.get_inputs()[0].name: im_batch}, - )[0] - return features + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() + features = self.session.run( + [self.session.get_outputs()[0].name], + {self.session.get_inputs()[0].name: im_batch}, + )[0] + return features diff --git a/ethology/reid/backends/openvino_backend.py b/ethology/reid/backends/openvino_backend.py index f06392bf..0c56a06e 100644 --- a/ethology/reid/backends/openvino_backend.py +++ b/ethology/reid/backends/openvino_backend.py @@ -1,48 +1,49 @@ from pathlib import Path from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed + class OpenVinoBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half + + def load_model(self, w): + # self.checker.check_packages(("openvino>=2025.2.0",)) + + print(f"Loading {w} for OpenVINO inference...") + try: + # requires openvino-dev: https://pypi.org/project/openvino-dev/ + from openvino import Core, Layout + except ImportError: + print( + f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" + "requires openvino pip package to be installed!\n" + "$ pip install openvino>=2025.2.0\n" + ) + raise + ie = Core() + w = Path(w) + print(w) + if w.suffix == ".bin": + w = w.with_suffix(".xml") + + if not w.is_file(): # if not *.xml + w = next( + Path(w).glob("*.xml") + ) # get *.xml file from *_openvino_model dir + network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) + if network.get_parameters()[0].get_layout().empty: + network.get_parameters()[0].set_layout(Layout("NCWH")) + self.executable_network = ie.compile_model( + network, device_name="CPU" + ) # device_name="MYRIAD" for Intel NCS2 + self.output_layer = next(iter(self.executable_network.outputs)) - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half - - def load_model(self, w): - # self.checker.check_packages(("openvino>=2025.2.0",)) - - print(f"Loading {w} for OpenVINO inference...") - try: - # requires openvino-dev: https://pypi.org/project/openvino-dev/ - from openvino import Core, Layout - except ImportError: - print( - f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n" - "requires openvino pip package to be installed!\n" - "$ pip install openvino>=2025.2.0\n" - ) - raise - ie = Core() - w = Path(w) - print(w) - if w.suffix == '.bin': - w = w.with_suffix('.xml') - - if not w.is_file(): # if not *.xml - w = next( - Path(w).glob("*.xml") - ) # get *.xml file from *_openvino_model dir - network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin")) - if network.get_parameters()[0].get_layout().empty: - network.get_parameters()[0].set_layout(Layout("NCWH")) - self.executable_network = ie.compile_model( - network, device_name="CPU" - ) # device_name="MYRIAD" for Intel NCS2 - self.output_layer = next(iter(self.executable_network.outputs)) - - def forward(self, im_batch): - im_batch = im_batch.cpu().numpy() # FP32 - features = self.executable_network([im_batch])[self.output_layer] - return features + def forward(self, im_batch): + im_batch = im_batch.cpu().numpy() # FP32 + features = self.executable_network([im_batch])[self.output_layer] + return features diff --git a/ethology/reid/backends/pytorch_backend.py b/ethology/reid/backends/pytorch_backend.py index d3dbfa06..2e859cc8 100644 --- a/ethology/reid/backends/pytorch_backend.py +++ b/ethology/reid/backends/pytorch_backend.py @@ -1,20 +1,20 @@ from ethology.reid.backends.base_backend import BaseModelBackend from ethology.reid.core.registry import ReIDModelRegistry -class PyTorchBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half +class PyTorchBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half - def load_model(self, w): - # Load a PyTorch model - if w and w.is_file(): - ReIDModelRegistry.load_pretrained_weights(self.model, w) - self.model.to(self.device).eval() - self.model.half() if self.half else self.model.float() + def load_model(self, w): + # Load a PyTorch model + if w and w.is_file(): + ReIDModelRegistry.load_pretrained_weights(self.model, w) + self.model.to(self.device).eval() + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/backends/tensorrt_backend.py b/ethology/reid/backends/tensorrt_backend.py index 8dd7d7ee..4f6e95b0 100644 --- a/ethology/reid/backends/tensorrt_backend.py +++ b/ethology/reid/backends/tensorrt_backend.py @@ -1,310 +1,400 @@ +# Note: LOGGER can be replaced with print or a local logger if needed +import os from collections import OrderedDict, namedtuple import numpy as np import torch from ethology.reid.backends.base_backend import BaseModelBackend -# Note: LOGGER can be replaced with print or a local logger if needed - -import os -import sys -import torch -import numpy as np -from collections import namedtuple, OrderedDict - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) class TensorRTBackend(BaseModelBackend): - def __init__(self, engine_path, device=None): - import hashlib - import requests - self.device = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) - self.fp16 = False - self.model_ = None - self.context = None - self.bindings = None - self.binding_addrs = None - self.is_trt10 = False - # Download engine if engine_path is a URL - if engine_path.startswith("http://") or engine_path.startswith("https://"): - # Use a hash of the URL for filename - engine_hash = hashlib.md5(engine_path.encode()).hexdigest() - filename = f"trt_engine_{engine_hash}.engine" - cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") - os.makedirs(cache_dir, exist_ok=True) - cached_file = os.path.join(cache_dir, filename) - if not os.path.exists(cached_file): - print(f"[TensorRT] Downloading engine from {engine_path} to {cached_file}") - with requests.get(engine_path, stream=True) as r: - r.raise_for_status() - with open(cached_file, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - else: - print(f"[TensorRT] Using cached engine at {cached_file}") - self.engine_path = cached_file - else: - self.engine_path = engine_path - self.load_model(self.engine_path) - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - try: - import tensorrt as trt - import pycuda.driver as cuda - import pycuda.autoinit # noqa: F401 - except ImportError: - raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH.") - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError("CUDA device not available for TensorRT inference.") - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f: - with trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features - - def load_model(self, w): - print(f"Loading {w} for TensorRT inference...") - # self.checker.check_packages(("nvidia-tensorrt",)) - try: - import tensorrt as trt # TensorRT library - except ImportError: - raise ImportError("Please install tensorrt to use this backend.") - - if self.device.type == "cpu": - if torch.cuda.is_available(): - self.device = torch.device("cuda:0") - else: - raise ValueError("CUDA device not available for TensorRT inference.") - - Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr")) - logger = trt.Logger(trt.Logger.INFO) - - # Deserialize the engine - with open(w, "rb") as f: - with trt.Runtime(logger) as runtime: - self.model_ = runtime.deserialize_cuda_engine(f.read()) - - # Execution context - self.context = self.model_.create_execution_context() - self.bindings = OrderedDict() - - self.is_trt10 = not hasattr(self.model_, "num_bindings") - num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings) - - # Parse bindings - for index in num: - if self.is_trt10: - name = self.model_.get_tensor_name(index) - dtype = trt.nptype(self.model_.get_tensor_dtype(name)) - is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT - if is_input and -1 in tuple(self.model_.get_tensor_shape(name)): - self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1])) - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_tensor_shape(name)) - - else: - name = self.model_.get_binding_name(index) - dtype = trt.nptype(self.model_.get_binding_dtype(index)) - is_input = self.model_.binding_is_input(index) - - # Handle dynamic shapes - if is_input and -1 in self.model_.get_binding_shape(index): - profile_index = 0 - min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index) - self.context.set_binding_shape(index, opt_shape) - - if is_input and dtype == np.float16: - self.fp16 = True - - shape = tuple(self.context.get_binding_shape(index)) - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device) - self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr())) - - self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) - - def forward(self, im_batch): - temp_im_batch = im_batch.clone() - batch_array = [] - inp_batch = im_batch.shape[0] - out_batch = self.bindings["output"].shape[0] - resultant_features = [] - - # Divide batch to sub batches - while inp_batch > out_batch: - batch_array.append(temp_im_batch[:out_batch]) - temp_im_batch = temp_im_batch[out_batch:] - inp_batch = temp_im_batch.shape[0] - if temp_im_batch.shape[0] > 0: - batch_array.append(temp_im_batch) - - for temp_batch in batch_array: - # Adjust for dynamic shapes - if temp_batch.shape != self.bindings["images"].shape: - if self.is_trt10: - self.context.set_input_shape("images", temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output"))) - else: - i_in = self.model_.get_binding_index("images") - i_out = self.model_.get_binding_index("output") - self.context.set_binding_shape(i_in, temp_batch.shape) - self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape) - output_shape = tuple(self.context.get_binding_shape(i_out)) - self.bindings["output"].data.resize_(output_shape) - - s = self.bindings["images"].shape - assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}" - - self.binding_addrs["images"] = int(temp_batch.data_ptr()) - - # Execute inference - self.context.execute_v2(list(self.binding_addrs.values())) - features = self.bindings["output"].data - resultant_features.append(features.clone()) - - if len(resultant_features) == 1: - return resultant_features[0] - else: - rslt_features = torch.cat(resultant_features, dim=0) - rslt_features = rslt_features[: im_batch.shape[0]] - return rslt_features + def __init__(self, engine_path, device=None): + import hashlib + + import requests + + self.device = device or ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + self.fp16 = False + self.model_ = None + self.context = None + self.bindings = None + self.binding_addrs = None + self.is_trt10 = False + # Download engine if engine_path is a URL + if engine_path.startswith("http://") or engine_path.startswith( + "https://" + ): + # Use a hash of the URL for filename + engine_hash = hashlib.md5(engine_path.encode()).hexdigest() + filename = f"trt_engine_{engine_hash}.engine" + cache_dir = os.path.expanduser("~/.cache/ethology/tensorrt/") + os.makedirs(cache_dir, exist_ok=True) + cached_file = os.path.join(cache_dir, filename) + if not os.path.exists(cached_file): + print( + f"[TensorRT] Downloading engine from {engine_path} to {cached_file}" + ) + with requests.get(engine_path, stream=True) as r: + r.raise_for_status() + with open(cached_file, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + else: + print(f"[TensorRT] Using cached engine at {cached_file}") + self.engine_path = cached_file + else: + self.engine_path = engine_path + self.load_model(self.engine_path) + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + try: + import pycuda.autoinit # noqa: F401 + import pycuda.driver as cuda + import tensorrt as trt + except ImportError: + raise ImportError( + "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libnvinfer.so.8 is available in LD_LIBRARY_PATH." + ) + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError( + "CUDA device not available for TensorRT inference." + ) + + Binding = namedtuple( + "Binding", ("name", "dtype", "shape", "data", "ptr") + ) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f, trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + self.bindings["output"].data.resize_( + tuple(self.context.get_tensor_shape("output")) + ) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, ( + f"Input size {temp_batch.shape} does not match model size {s}" + ) + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features + + def load_model(self, w): + print(f"Loading {w} for TensorRT inference...") + # self.checker.check_packages(("nvidia-tensorrt",)) + try: + import tensorrt as trt # TensorRT library + except ImportError: + raise ImportError("Please install tensorrt to use this backend.") + + if self.device.type == "cpu": + if torch.cuda.is_available(): + self.device = torch.device("cuda:0") + else: + raise ValueError( + "CUDA device not available for TensorRT inference." + ) + + Binding = namedtuple( + "Binding", ("name", "dtype", "shape", "data", "ptr") + ) + logger = trt.Logger(trt.Logger.INFO) + + # Deserialize the engine + with open(w, "rb") as f, trt.Runtime(logger) as runtime: + self.model_ = runtime.deserialize_cuda_engine(f.read()) + + # Execution context + self.context = self.model_.create_execution_context() + self.bindings = OrderedDict() + + self.is_trt10 = not hasattr(self.model_, "num_bindings") + num = ( + range(self.model_.num_io_tensors) + if self.is_trt10 + else range(self.model_.num_bindings) + ) + + # Parse bindings + for index in num: + if self.is_trt10: + name = self.model_.get_tensor_name(index) + dtype = trt.nptype(self.model_.get_tensor_dtype(name)) + is_input = ( + self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT + ) + if is_input and -1 in tuple( + self.model_.get_tensor_shape(name) + ): + self.context.set_input_shape( + name, + tuple( + self.model_.get_tensor_profile_shape(name, 0)[1] + ), + ) + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_tensor_shape(name)) + + else: + name = self.model_.get_binding_name(index) + dtype = trt.nptype(self.model_.get_binding_dtype(index)) + is_input = self.model_.binding_is_input(index) + + # Handle dynamic shapes + if is_input and -1 in self.model_.get_binding_shape(index): + profile_index = 0 + min_shape, opt_shape, max_shape = ( + self.model_.get_profile_shape(profile_index, index) + ) + self.context.set_binding_shape(index, opt_shape) + + if is_input and dtype == np.float16: + self.fp16 = True + + shape = tuple(self.context.get_binding_shape(index)) + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to( + self.device + ) + self.bindings[name] = Binding( + name, dtype, shape, data, int(data.data_ptr()) + ) + + self.binding_addrs = OrderedDict( + (n, d.ptr) for n, d in self.bindings.items() + ) + + def forward(self, im_batch): + temp_im_batch = im_batch.clone() + batch_array = [] + inp_batch = im_batch.shape[0] + out_batch = self.bindings["output"].shape[0] + resultant_features = [] + + # Divide batch to sub batches + while inp_batch > out_batch: + batch_array.append(temp_im_batch[:out_batch]) + temp_im_batch = temp_im_batch[out_batch:] + inp_batch = temp_im_batch.shape[0] + if temp_im_batch.shape[0] > 0: + batch_array.append(temp_im_batch) + + for temp_batch in batch_array: + # Adjust for dynamic shapes + if temp_batch.shape != self.bindings["images"].shape: + if self.is_trt10: + self.context.set_input_shape("images", temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + self.bindings["output"].data.resize_( + tuple(self.context.get_tensor_shape("output")) + ) + else: + i_in = self.model_.get_binding_index("images") + i_out = self.model_.get_binding_index("output") + self.context.set_binding_shape(i_in, temp_batch.shape) + self.bindings["images"] = self.bindings["images"]._replace( + shape=temp_batch.shape + ) + output_shape = tuple(self.context.get_binding_shape(i_out)) + self.bindings["output"].data.resize_(output_shape) + + s = self.bindings["images"].shape + assert temp_batch.shape == s, ( + f"Input size {temp_batch.shape} does not match model size {s}" + ) + + self.binding_addrs["images"] = int(temp_batch.data_ptr()) + + # Execute inference + self.context.execute_v2(list(self.binding_addrs.values())) + features = self.bindings["output"].data + resultant_features.append(features.clone()) + + if len(resultant_features) == 1: + return resultant_features[0] + else: + rslt_features = torch.cat(resultant_features, dim=0) + rslt_features = rslt_features[: im_batch.shape[0]] + return rslt_features diff --git a/ethology/reid/backends/tflite_backend.py b/ethology/reid/backends/tflite_backend.py index b0a7b707..eb10d4e8 100644 --- a/ethology/reid/backends/tflite_backend.py +++ b/ethology/reid/backends/tflite_backend.py @@ -4,37 +4,39 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed + class TFLiteBackend(BaseModelBackend): - """ - A class to handle TensorFlow Lite model inference with dynamic batch size support. - """ - def __init__(self, weights: Path, device: str, half: bool): - super().__init__(weights, device, half) - self.nhwc = True - self.half = False - - def load_model(self, w): - # self.checker.check_packages(("tensorflow",)) - print(f"Loading {str(w)} for TensorFlow Lite inference...") - import tensorflow as tf - self.interpreter = tf.lite.Interpreter(model_path=str(w)) - self.interpreter.allocate_tensors() - self.input_details = self.interpreter.get_input_details() - self.output_details = self.interpreter.get_output_details() - self.current_allocated_batch_size = self.input_details[0]["shape"][0] - - def forward(self, im_batch: torch.Tensor) -> np.ndarray: - im_batch = im_batch.cpu().numpy() - batch_size = im_batch.shape[0] - if batch_size != self.current_allocated_batch_size: - self.interpreter.resize_tensor_input( - self.input_details[0]["index"], [batch_size, 256, 128, 3] - ) - self.interpreter.allocate_tensors() - self.current_allocated_batch_size = batch_size - self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) - self.interpreter.invoke() - features = self.interpreter.get_tensor(self.output_details[0]["index"]) - return features + """A class to handle TensorFlow Lite model inference with dynamic batch size support.""" + + def __init__(self, weights: Path, device: str, half: bool): + super().__init__(weights, device, half) + self.nhwc = True + self.half = False + + def load_model(self, w): + # self.checker.check_packages(("tensorflow",)) + print(f"Loading {str(w)} for TensorFlow Lite inference...") + import tensorflow as tf + + self.interpreter = tf.lite.Interpreter(model_path=str(w)) + self.interpreter.allocate_tensors() + self.input_details = self.interpreter.get_input_details() + self.output_details = self.interpreter.get_output_details() + self.current_allocated_batch_size = self.input_details[0]["shape"][0] + + def forward(self, im_batch: torch.Tensor) -> np.ndarray: + im_batch = im_batch.cpu().numpy() + batch_size = im_batch.shape[0] + if batch_size != self.current_allocated_batch_size: + self.interpreter.resize_tensor_input( + self.input_details[0]["index"], [batch_size, 256, 128, 3] + ) + self.interpreter.allocate_tensors() + self.current_allocated_batch_size = batch_size + self.interpreter.set_tensor(self.input_details[0]["index"], im_batch) + self.interpreter.invoke() + features = self.interpreter.get_tensor(self.output_details[0]["index"]) + return features diff --git a/ethology/reid/backends/torchscript_backend.py b/ethology/reid/backends/torchscript_backend.py index b6602171..1142fcc4 100644 --- a/ethology/reid/backends/torchscript_backend.py +++ b/ethology/reid/backends/torchscript_backend.py @@ -1,20 +1,21 @@ import torch from ethology.reid.backends.base_backend import BaseModelBackend + # Note: LOGGER can be replaced with print or a local logger if needed -class TorchscriptBackend(BaseModelBackend): - def __init__(self, weights, device, half): - super().__init__(weights, device, half) - self.nhwc = False - self.half = half +class TorchscriptBackend(BaseModelBackend): + def __init__(self, weights, device, half): + super().__init__(weights, device, half) + self.nhwc = False + self.half = half - def load_model(self, w): - print(f"Loading {w} for TorchScript inference...") - self.model = torch.jit.load(w) - self.model.half() if self.half else self.model.float() + def load_model(self, w): + print(f"Loading {w} for TorchScript inference...") + self.model = torch.jit.load(w) + self.model.half() if self.half else self.model.float() - def forward(self, im_batch): - features = self.model(im_batch) - return features + def forward(self, im_batch): + features = self.model(im_batch) + return features diff --git a/ethology/reid/core/auto_backend.py b/ethology/reid/core/auto_backend.py index 6f43eba2..22f2c4e2 100644 --- a/ethology/reid/core/auto_backend.py +++ b/ethology/reid/core/auto_backend.py @@ -1,74 +1,89 @@ - from pathlib import Path -from typing import Tuple, Union + import torch + from ethology.reid.backends.onnx_backend import ONNXBackend from ethology.reid.backends.openvino_backend import OpenVinoBackend from ethology.reid.backends.pytorch_backend import PyTorchBackend + try: - from ethology.reid.backends.tensorrt_backend import TensorRTBackend + from ethology.reid.backends.tensorrt_backend import TensorRTBackend except ImportError: - class TensorRTBackend: - def __init__(self, *args, **kwargs): - raise ImportError("TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH.") + + class TensorRTBackend: + def __init__(self, *args, **kwargs): + raise ImportError( + "TensorRT and pycuda are required for TensorRTBackend. Please install them and ensure libcudnn.so.8 is available in LD_LIBRARY_PATH." + ) + + from ethology.reid.backends.tflite_backend import TFLiteBackend from ethology.reid.backends.torchscript_backend import TorchscriptBackend + # from ethology.reid.core import export_formats # If needed, implement or copy export_formats # from ethology.utils import WEIGHTS # If needed, implement or set WEIGHTS # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER # from ethology.utils.torch_utils import select_device # If needed, implement or set select_device + class ReidAutoBackend: - def __init__( - self, - weights: Path, - device: torch.device = torch.device("cpu"), - half: bool = False, - ): - super().__init__() - w = weights[0] if isinstance(weights, list) else weights - ( - self.pt, - self.pth, - self.jit, - self.onnx, - self.xml, - self.engine, - self.tflite, - ) = self.model_type(w) - self.weights = weights - self.device = device # For simplicity, skip select_device for now - self.half = half - self.model = self.get_backend() + def __init__( + self, + weights: Path, + device: torch.device = torch.device("cpu"), + half: bool = False, + ): + super().__init__() + w = weights[0] if isinstance(weights, list) else weights + ( + self.pt, + self.pth, + self.jit, + self.onnx, + self.xml, + self.engine, + self.tflite, + ) = self.model_type(w) + self.weights = weights + self.device = device # For simplicity, skip select_device for now + self.half = half + self.model = self.get_backend() - def get_backend(self): - backend_map = { - self.pt or self.pth: PyTorchBackend, - self.jit: TorchscriptBackend, - self.onnx: ONNXBackend, - self.engine: TensorRTBackend, - self.xml: OpenVinoBackend, - self.tflite: TFLiteBackend, - } - for condition, backend_class in backend_map.items(): - if condition: - return backend_class(self.weights, self.device, self.half) - raise RuntimeError("This model framework is not supported yet!") + def get_backend(self): + backend_map = { + self.pt or self.pth: PyTorchBackend, + self.jit: TorchscriptBackend, + self.onnx: ONNXBackend, + self.engine: TensorRTBackend, + self.xml: OpenVinoBackend, + self.tflite: TFLiteBackend, + } + for condition, backend_class in backend_map.items(): + if condition: + return backend_class(self.weights, self.device, self.half) + raise RuntimeError("This model framework is not supported yet!") - def check_suffix(self, file: Path = "osnet_x0_25_msmt17.pt", suffix: Union[str, Tuple[str, ...]] = (".pt",), msg: str = ""): - suffix = [suffix] if isinstance(suffix, str) else list(suffix) - files = [file] if isinstance(file, (str, Path)) else list(file) - for f in files: - file_suffix = Path(f).suffix.lower() - if file_suffix and file_suffix not in suffix: - print(f"File {f} does not have an acceptable suffix. Expected: {suffix}") + def check_suffix( + self, + file: Path = "osnet_x0_25_msmt17.pt", + suffix: str | tuple[str, ...] = (".pt",), + msg: str = "", + ): + suffix = [suffix] if isinstance(suffix, str) else list(suffix) + files = [file] if isinstance(file, (str, Path)) else list(file) + for f in files: + file_suffix = Path(f).suffix.lower() + if file_suffix and file_suffix not in suffix: + print( + f"File {f} does not have an acceptable suffix. Expected: {suffix}" + ) - def model_type(self, p: Path) -> Tuple[bool, ...]: - # For demo, just check for .pt - sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] - self.check_suffix(p, sf) - types = [str(Path(p)).endswith(s) for s in sf] - # OpenVINO explicit check - if Path(p).suffix in ['.xml', '.bin']: - types[3] = True - return tuple(types) + def model_type(self, p: Path) -> tuple[bool, ...]: + # For demo, just check for .pt + sf = [".pt", ".pth", ".jit", ".onnx", ".xml", ".engine", ".tflite"] + self.check_suffix(p, sf) + types = [str(Path(p)).endswith(s) for s in sf] + # OpenVINO explicit check + if Path(p).suffix in [".xml", ".bin"]: + types[3] = True + return tuple(types) diff --git a/ethology/reid/core/config.py b/ethology/reid/core/config.py index 926c0cc9..dc17cc14 100644 --- a/ethology/reid/core/config.py +++ b/ethology/reid/core/config.py @@ -1,16 +1,16 @@ MODEL_TYPES = [ - "resnet50", - "resnet101", - "mlfn", - "hacnn", - "mobilenetv2_x1_0", - "mobilenetv2_x1_4", - "osnet_x1_0", - "osnet_x0_75", - "osnet_x0_5", - "osnet_x0_25", - "osnet_ibn_x1_0", - "osnet_ain_x1_0", - "lmbn_n", - "clip", + "resnet50", + "resnet101", + "mlfn", + "hacnn", + "mobilenetv2_x1_0", + "mobilenetv2_x1_4", + "osnet_x1_0", + "osnet_x0_75", + "osnet_x0_5", + "osnet_x0_25", + "osnet_ibn_x1_0", + "osnet_ain_x1_0", + "lmbn_n", + "clip", ] diff --git a/ethology/reid/core/factory.py b/ethology/reid/core/factory.py index bc8b6ab1..27406383 100644 --- a/ethology/reid/core/factory.py +++ b/ethology/reid/core/factory.py @@ -1,30 +1,44 @@ - # Import model constructors from ethology's local backbones from ethology.reid.backbones.hacnn import HACNN from ethology.reid.backbones.mlfn import mlfn -from ethology.reid.backbones.mobilenetv2 import mobilenetv2_x1_0, mobilenetv2_x1_4 -from ethology.reid.backbones.osnet import osnet_ibn_x1_0, osnet_x0_5, osnet_x0_25, osnet_x0_75, osnet_x1_0 -from ethology.reid.backbones.osnet_ain import osnet_ain_x0_5, osnet_ain_x0_25, osnet_ain_x0_75, osnet_ain_x1_0 +from ethology.reid.backbones.mobilenetv2 import ( + mobilenetv2_x1_0, + mobilenetv2_x1_4, +) +from ethology.reid.backbones.osnet import ( + osnet_ibn_x1_0, + osnet_x0_5, + osnet_x0_25, + osnet_x0_75, + osnet_x1_0, +) +from ethology.reid.backbones.osnet_ain import ( + osnet_ain_x0_5, + osnet_ain_x0_25, + osnet_ain_x0_75, + osnet_ain_x1_0, +) from ethology.reid.backbones.resnet import resnet50, resnet101 + # from ethology.reid.backbones.lmbn.lmbn_n import LMBN_n # If present # from ethology.reid.backbones.clip.make_model import make_model # If present MODEL_FACTORY = { - "resnet50": resnet50, - "resnet101": resnet101, - "mobilenetv2_x1_0": mobilenetv2_x1_0, - "mobilenetv2_x1_4": mobilenetv2_x1_4, - "hacnn": HACNN, - "mlfn": mlfn, - "osnet_x1_0": osnet_x1_0, - "osnet_x0_75": osnet_x0_75, - "osnet_x0_5": osnet_x0_5, - "osnet_x0_25": osnet_x0_25, - "osnet_ibn_x1_0": osnet_ibn_x1_0, - "osnet_ain_x1_0": osnet_ain_x1_0, - "osnet_ain_x0_75": osnet_ain_x0_75, - "osnet_ain_x0_5": osnet_ain_x0_5, - "osnet_ain_x0_25": osnet_ain_x0_25, - # "lmbn_n": LMBN_n, # Uncomment if implemented - # "clip": make_model, # Uncomment if implemented + "resnet50": resnet50, + "resnet101": resnet101, + "mobilenetv2_x1_0": mobilenetv2_x1_0, + "mobilenetv2_x1_4": mobilenetv2_x1_4, + "hacnn": HACNN, + "mlfn": mlfn, + "osnet_x1_0": osnet_x1_0, + "osnet_x0_75": osnet_x0_75, + "osnet_x0_5": osnet_x0_5, + "osnet_x0_25": osnet_x0_25, + "osnet_ibn_x1_0": osnet_ibn_x1_0, + "osnet_ain_x1_0": osnet_ain_x1_0, + "osnet_ain_x0_75": osnet_ain_x0_75, + "osnet_ain_x0_5": osnet_ain_x0_5, + "osnet_ain_x0_25": osnet_ain_x0_25, + # "lmbn_n": LMBN_n, # Uncomment if implemented + # "clip": make_model, # Uncomment if implemented } diff --git a/ethology/reid/core/handler.py b/ethology/reid/core/handler.py index b5e51391..ba521ab2 100644 --- a/ethology/reid/core/handler.py +++ b/ethology/reid/core/handler.py @@ -2,32 +2,35 @@ # Thin wrapper to use BoxMOT ReID models in ethology from pathlib import Path -from typing import Union -import numpy as np +import numpy as np # Import ethology's local ReID handler from ethology.reid.core.reid_handler import ReID as EthologyReID + class ReIDHandler: - """ - Ethology ReID handler using local models and backends. - """ - def __init__(self, weights: Union[str, Path], device='cpu', half=False): + """Ethology ReID handler using local models and backends.""" + + def __init__(self, weights: str | Path, device="cpu", half=False): self.model = EthologyReID(weights=weights, device=device, half=half) - def extract_features(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: - """ - Extract feature embeddings for detections in a frame. + def extract_features( + self, frame: np.ndarray, dets: np.ndarray + ) -> np.ndarray: + """Extract feature embeddings for detections in a frame. + Parameters ---------- frame : np.ndarray (H, W, C) BGR image. dets : np.ndarray (N, 6) array of detections (x1, y1, x2, y2, conf, cls). + Returns ------- np.ndarray (N, D) feature embeddings. + """ return self.model(frame, dets) diff --git a/ethology/reid/core/registry.py b/ethology/reid/core/registry.py index 333cff2f..4b9c27fd 100644 --- a/ethology/reid/core/registry.py +++ b/ethology/reid/core/registry.py @@ -1,71 +1,88 @@ - from collections import OrderedDict + import torch -from ethology.reid.core.config import MODEL_TYPES #, NR_CLASSES_DICT, TRAINED_URLS + +from ethology.reid.core.config import ( + MODEL_TYPES, # , NR_CLASSES_DICT, TRAINED_URLS +) from ethology.reid.core.factory import MODEL_FACTORY + # from ethology.utils import logger as LOGGER # If needed, implement or set LOGGER + class ReIDModelRegistry: - """Encapsulates model registration and related utilities.""" + """Encapsulates model registration and related utilities.""" - @staticmethod - def show_downloadable_models(): - # LOGGER.info("Available .pt ReID models for automatic download") - # LOGGER.info(list(TRAINED_URLS.keys())) - pass + @staticmethod + def show_downloadable_models(): + # LOGGER.info("Available .pt ReID models for automatic download") + # LOGGER.info(list(TRAINED_URLS.keys())) + pass - @staticmethod - def get_model_name(model): - for name in MODEL_TYPES: - if name in model.name: - return name - return None + @staticmethod + def get_model_name(model): + for name in MODEL_TYPES: + if name in model.name: + return name + return None - @staticmethod - def get_model_url(model): - # return TRAINED_URLS.get(model.name, None) - return None + @staticmethod + def get_model_url(model): + # return TRAINED_URLS.get(model.name, None) + return None - @staticmethod - def load_pretrained_weights(model, weight_path): - device = "cpu" if not torch.cuda.is_available() else None - checkpoint = torch.load( - weight_path, - map_location=torch.device("cpu") if device == "cpu" else None, - weights_only=False, - encoding='latin1', - ) - state_dict = checkpoint.get("state_dict", checkpoint) - model_dict = model.state_dict() - new_state_dict = OrderedDict() - matched_layers, discarded_layers = [], [] - for k, v in state_dict.items(): - key = k[7:] if k.startswith("module.") else k - if key in model_dict and model_dict[key].size() == v.size(): - new_state_dict[key] = v - matched_layers.append(key) - else: - discarded_layers.append(key) - model_dict.update(new_state_dict) - model.load_state_dict(model_dict) + @staticmethod + def load_pretrained_weights(model, weight_path): + device = "cpu" if not torch.cuda.is_available() else None + checkpoint = torch.load( + weight_path, + map_location=torch.device("cpu") if device == "cpu" else None, + weights_only=False, + encoding="latin1", + ) + state_dict = checkpoint.get("state_dict", checkpoint) + model_dict = model.state_dict() + new_state_dict = OrderedDict() + matched_layers, discarded_layers = [], [] + for k, v in state_dict.items(): + key = k[7:] if k.startswith("module.") else k + if key in model_dict and model_dict[key].size() == v.size(): + new_state_dict[key] = v + matched_layers.append(key) + else: + discarded_layers.append(key) + model_dict.update(new_state_dict) + model.load_state_dict(model_dict) - @staticmethod - def show_available_models(): - # LOGGER.info("Available models:") - # LOGGER.info(list(MODEL_FACTORY.keys())) - pass + @staticmethod + def show_available_models(): + # LOGGER.info("Available models:") + # LOGGER.info(list(MODEL_FACTORY.keys())) + pass - @staticmethod - def get_nr_classes(weights): - # dataset_key = weights.name.split("_")[1] - # return NR_CLASSES_DICT.get(dataset_key, 1) - return 1 + @staticmethod + def get_nr_classes(weights): + # dataset_key = weights.name.split("_")[1] + # return NR_CLASSES_DICT.get(dataset_key, 1) + return 1 - @staticmethod - def build_model(name, weights, num_classes, loss="softmax", pretrained=True, use_gpu=True): - if name not in MODEL_FACTORY: - available = list(MODEL_FACTORY.keys()) - raise KeyError(f"Unknown model '{name}'. Must be one of {available}") - return MODEL_FACTORY[name]( - num_classes=num_classes, loss=loss, pretrained=pretrained, use_gpu=use_gpu - ) + @staticmethod + def build_model( + name, + weights, + num_classes, + loss="softmax", + pretrained=True, + use_gpu=True, + ): + if name not in MODEL_FACTORY: + available = list(MODEL_FACTORY.keys()) + raise KeyError( + f"Unknown model '{name}'. Must be one of {available}" + ) + return MODEL_FACTORY[name]( + num_classes=num_classes, + loss=loss, + pretrained=pretrained, + use_gpu=use_gpu, + ) diff --git a/ethology/reid/core/reid_handler.py b/ethology/reid/core/reid_handler.py index 2c72658a..62d42209 100644 --- a/ethology/reid/core/reid_handler.py +++ b/ethology/reid/core/reid_handler.py @@ -1,28 +1,33 @@ - from pathlib import Path -from typing import Union + import numpy as np + from ethology.reid.core.auto_backend import ReidAutoBackend + class ReID: - def __init__(self, weights: Union[str, Path], device='cpu', half=False): - self.weights = Path(weights) - self.device = device - self.half = half - self.backend = ReidAutoBackend(weights=self.weights, device=device, half=half) - self.model = self.backend.model - - def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: - """ - Extract features for detections in a frame. - Args: - frame: (H, W, C) BGR image - dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. - Returns: - embs: (N, D) embeddings. - """ - if dets.shape[0] == 0: - return np.empty((0, 0)) - xyxy = dets[:, :4] - embs = self.model.get_features(xyxy, frame) - return embs + def __init__(self, weights: str | Path, device="cpu", half=False): + self.weights = Path(weights) + self.device = device + self.half = half + self.backend = ReidAutoBackend( + weights=self.weights, device=device, half=half + ) + self.model = self.backend.model + + def __call__(self, frame: np.ndarray, dets: np.ndarray) -> np.ndarray: + """Extract features for detections in a frame. + + Args: + frame: (H, W, C) BGR image + dets: (N, 6) detections (x1, y1, x2, y2, conf, cls) or similar. + + Returns: + embs: (N, D) embeddings. + + """ + if dets.shape[0] == 0: + return np.empty((0, 0)) + xyxy = dets[:, :4] + embs = self.model.get_features(xyxy, frame) + return embs diff --git a/examples/reid_mot_example.py b/examples/reid_mot_example.py index 3862e1dd..304fd9b1 100644 --- a/examples/reid_mot_example.py +++ b/examples/reid_mot_example.py @@ -1,5 +1,4 @@ -""" -Example: Using the new ReID trajectory utility with MOT results +"""Example: Using the new ReID trajectory utility with MOT results This script demonstrates how to use the ethology reid trajectory handler with a sample MOT output. """ @@ -8,16 +7,16 @@ # Example: Dummy MOT results (replace with your actual MOT output) mot_results = [ - {'id': 1, 'trajectory': [(0, 0), (1, 1), (2, 2)]}, - {'id': 2, 'trajectory': [(5, 5), (6, 6), (7, 7)]}, + {"id": 1, "trajectory": [(0, 0), (1, 1), (2, 2)]}, + {"id": 2, "trajectory": [(5, 5), (6, 6), (7, 7)]}, ] # Initialize the handler (adjust parameters as needed) -reid_handler = ReIDTrajectoryHandler(model_name='osnet', device='cpu') +reid_handler = ReIDTrajectoryHandler(model_name="osnet", device="cpu") # Run re-identification on the MOT results reid_results = reid_handler.reidentify(mot_results) -print('ReID Results:') +print("ReID Results:") for item in reid_results: print(item) diff --git a/tests/test_unit/test_reid_handler.py b/tests/test_unit/test_reid_handler.py index 3a5146cf..bc8a199c 100644 --- a/tests/test_unit/test_reid_handler.py +++ b/tests/test_unit/test_reid_handler.py @@ -1,12 +1,16 @@ import numpy as np + from ethology.reid.core.handler import ReIDHandler + def test_extract_features_shape(): - handler = ReIDHandler(weights='osnet_x0_25_imagenet.pth') + handler = ReIDHandler(weights="osnet_x0_25_imagenet.pth") frame = np.random.randint(0, 255, (128, 64, 3), dtype=np.uint8) - dets = np.array([ - [10, 10, 50, 100, 0.9, 1], - [60, 20, 100, 110, 0.8, 2], - ]) + dets = np.array( + [ + [10, 10, 50, 100, 0.9, 1], + [60, 20, 100, 110, 0.8, 2], + ] + ) feats = handler.extract_features(frame, dets) assert feats.shape[0] == dets.shape[0] From a377384b42706a5185ceb6a602dae899816fab96 Mon Sep 17 00:00:00 2001 From: AnandMayank Date: Wed, 18 Feb 2026 20:16:58 +0530 Subject: [PATCH 10/12] fix(reid): address linter and docstring issues in resnet, mlfn, mobilenetv2 backbones --- ethology/reid/backbones/mlfn.py | 13 ++-- ethology/reid/backbones/mobilenetv2.py | 82 ++++++++++++++------------ ethology/reid/backbones/resnet.py | 5 +- 3 files changed, 54 insertions(+), 46 deletions(-) diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index 3d04d003..7e490d0a 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -5,6 +5,10 @@ from torch import nn from torch.nn import functional as F +""" +MLFN backbone for person re-identification. +""" + __all__ = ["mlfn"] model_urls = { # training epoch = 5, top1 = 51.6 @@ -21,7 +25,7 @@ def __init__( mid_channels = out_channels // 2 # Factor Modules - self.fm_conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False) + super().__init__() self.fm_bn1 = nn.BatchNorm2d(mid_channels) self.fm_conv2 = nn.Conv2d( mid_channels, @@ -114,7 +118,7 @@ def __init__( **kwargs, ): super(MLFN, self).__init__() - self.loss = loss + channels=None, self.groups = groups # first convolutional layer @@ -264,9 +268,10 @@ def mlfn(num_classes, loss="softmax", pretrained=True, **kwargs): warnings.warn( "The imagenet pretrained weights need to be manually downloaded from {}".format( model_urls["imagenet"] - ) + ), + stacklevel=2, ) return model -# Copied from boxmot/boxmot/reid/backbones/mlfn.py + diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py index b3e69186..5d004eb3 100644 --- a/ethology/reid/backbones/mobilenetv2.py +++ b/ethology/reid/backbones/mobilenetv2.py @@ -1,4 +1,7 @@ -# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license + +""" +MobileNetV2 backbone for person re-identification. +""" import torch.utils.model_zoo as model_zoo @@ -29,7 +32,10 @@ class ConvBlock(nn.Module): g (int): number of blocked connections from input channels to output channels (default: 1). - """ + def __init__(self, in_c, out_c, k, s=1, p=0, g=1): + super().__init__() + self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) + self.bn = nn.BatchNorm2d(out_c) def __init__(self, in_c, out_c, k, s=1, p=0, g=1): super(ConvBlock, self).__init__() @@ -43,18 +49,18 @@ def forward(self, x): class Bottleneck(nn.Module): - def __init__(self, in_channels, out_channels, expansion_factor, stride=1): - super(Bottleneck, self).__init__() - mid_channels = in_channels * expansion_factor - self.use_residual = stride == 1 and in_channels == out_channels - self.conv1 = ConvBlock(in_channels, mid_channels, 1) - self.dwconv2 = ConvBlock( - mid_channels, mid_channels, 3, stride, 1, g=mid_channels - ) - self.conv3 = nn.Sequential( - nn.Conv2d(mid_channels, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels), - ) + def __init__(self, in_channels, out_channels, expansion_factor, stride=1): + super().__init__() + mid_channels = in_channels * expansion_factor + self.use_residual = stride == 1 and in_channels == out_channels + self.conv1 = ConvBlock(in_channels, mid_channels, 1) + self.dwconv2 = ConvBlock( + mid_channels, mid_channels, 3, stride, 1, g=mid_channels + ) + self.conv3 = nn.Sequential( + nn.Conv2d(mid_channels, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + ) def forward(self, x): m = self.conv1(x) @@ -78,19 +84,19 @@ class MobileNetV2(nn.Module): - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. """ - def __init__( - self, - num_classes, - width_mult=1, - loss="softmax", - fc_dims=None, - dropout_p=None, - **kwargs, - ): - super(MobileNetV2, self).__init__() - self.loss = loss - self.in_channels = int(32 * width_mult) - self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super().__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 # construct layers self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) @@ -125,17 +131,17 @@ def __init__( self._init_params() - def _make_layer(self, block, t, c, n, s): - # t: expansion factor - # c: output channels - # n: number of blocks - # s: stride for first layer - layers = [] - layers.append(block(self.in_channels, c, t, s)) - self.in_channels = c - for i in range(1, n): - layers.append(block(self.in_channels, c, t)) - return nn.Sequential(*layers) + def _make_layer(self, block, t, c, n, s): + # t: expansion factor + # c: output channels + # n: number of blocks + # s: stride for first layer + layers = [] + layers.append(block(self.in_channels, c, t, s)) + self.in_channels = c + for _ in range(1, n): + layers.append(block(self.in_channels, c, t)) + return nn.Sequential(*layers) def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None): """Constructs fully connected layer. diff --git a/ethology/reid/backbones/resnet.py b/ethology/reid/backbones/resnet.py index 7cf28df1..0efa708e 100644 --- a/ethology/reid/backbones/resnet.py +++ b/ethology/reid/backbones/resnet.py @@ -1,13 +1,10 @@ # Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license -<<<<<<< HEAD -"""Code source: https://github.com/pytorch/vision""" -======= + """ Code source: https://github.com/pytorch/vision """ from __future__ import absolute_import, division ->>>>>>> a4dd694 (style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues) import torch.utils.model_zoo as model_zoo from torch import nn From d1a8ac749b8e6ef40bfcebcc8fa4e29a74b51a80 Mon Sep 17 00:00:00 2001 From: AnandMayank Date: Wed, 18 Feb 2026 20:26:55 +0530 Subject: [PATCH 11/12] fix(reid): resolve linter, docstring, and merge issues in backbone files --- ethology/reid/backbones/mlfn.py | 17 +++++----- ethology/reid/backbones/mobilenetv2.py | 43 ++++++++++++-------------- ethology/reid/backbones/resnet.py | 26 ++-------------- 3 files changed, 29 insertions(+), 57 deletions(-) diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index 7e490d0a..bdd019e1 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -1,3 +1,4 @@ + """MLFN backbone for person re-identification.""" import torch @@ -5,10 +6,6 @@ from torch import nn from torch.nn import functional as F -""" -MLFN backbone for person re-identification. -""" - __all__ = ["mlfn"] model_urls = { # training epoch = 5, top1 = 51.6 @@ -20,7 +17,7 @@ class MLFNBlock(nn.Module): def __init__( self, in_channels, out_channels, stride, fsm_channels, groups=32 ): - super(MLFNBlock, self).__init__() + super().__init__() self.groups = groups mid_channels = out_channels // 2 @@ -113,11 +110,13 @@ def __init__( num_classes, loss="softmax", groups=32, - channels=[64, 256, 512, 1024, 2048], + channels=None, embed_dim=1024, **kwargs, ): - super(MLFN, self).__init__() + super().__init__() + if channels is None: + channels = [64, 256, 512, 1024, 2048] channels=None, self.groups = groups @@ -244,9 +243,9 @@ def forward(self, x): def init_pretrained_weights(model, model_url): - """Initializes model with pretrained weights. + """Initialize model with pretrained weights. - Layers that don't match with pretrained layers in name or size are kept unchanged. + Keep layers unchanged if they don't match pretrained layers in name or size. """ pretrain_dict = model_zoo.load_url(model_url) model_dict = model.state_dict() diff --git a/ethology/reid/backbones/mobilenetv2.py b/ethology/reid/backbones/mobilenetv2.py index 5d004eb3..d0f5becd 100644 --- a/ethology/reid/backbones/mobilenetv2.py +++ b/ethology/reid/backbones/mobilenetv2.py @@ -37,12 +37,7 @@ def __init__(self, in_c, out_c, k, s=1, p=0, g=1): self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g) self.bn = nn.BatchNorm2d(out_c) - def __init__(self, in_c, out_c, k, s=1, p=0, g=1): - super(ConvBlock, self).__init__() - self.conv = nn.Conv2d( - in_c, out_c, k, stride=s, padding=p, bias=False, groups=g - ) - self.bn = nn.BatchNorm2d(out_c) + # Only keep the correct __init__ def forward(self, x): return F.relu6(self.bn(self.conv(x))) @@ -73,30 +68,30 @@ def forward(self, x): class MobileNetV2(nn.Module): - """MobileNetV2. + """ + MobileNetV2 backbone for person re-identification. Reference: - Sandler et al. MobileNetV2: Inverted Residuals and - Linear Bottlenecks. CVPR 2018. + Sandler et al. MobileNetV2: Inverted Residuals and Linear Bottlenecks. CVPR 2018. Public keys: - - ``mobilenetv2_x1_0``: MobileNetV2 x1.0. - - ``mobilenetv2_x1_4``: MobileNetV2 x1.4. + - mobilenetv2_x1_0: MobileNetV2 x1.0. + - mobilenetv2_x1_4: MobileNetV2 x1.4. """ - def __init__( - self, - num_classes, - width_mult=1, - loss="softmax", - fc_dims=None, - dropout_p=None, - **kwargs, - ): - super().__init__() - self.loss = loss - self.in_channels = int(32 * width_mult) - self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 + def __init__( + self, + num_classes, + width_mult=1, + loss="softmax", + fc_dims=None, + dropout_p=None, + **kwargs, + ): + super().__init__() + self.loss = loss + self.in_channels = int(32 * width_mult) + self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280 # construct layers self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1) diff --git a/ethology/reid/backbones/resnet.py b/ethology/reid/backbones/resnet.py index 0efa708e..8ee75172 100644 --- a/ethology/reid/backbones/resnet.py +++ b/ethology/reid/backbones/resnet.py @@ -1,16 +1,13 @@ -# Mikel Broström 🔥 BoxMOT 🧾 AGPL-3.0 license - """ Code source: https://github.com/pytorch/vision """ -from __future__ import absolute_import, division +from __future__ import absolute_import, division import torch.utils.model_zoo as model_zoo from torch import nn __all__ = [ -<<<<<<< HEAD "resnet18", "resnet34", "resnet50", @@ -29,26 +26,7 @@ "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", -======= - "resnet18", - "resnet34", - "resnet50", - "resnet101", - "resnet152", - "resnext50_32x4d", - "resnext101_32x8d", - "resnet50_fc512", -] - -model_urls = { - "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", - "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", - "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", - "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", - "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", - "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", - "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", ->>>>>>> a4dd694 (style(reid): fix ruff errors in hacnn.py and mlfn.py\n\n- Add missing docstrings\n- Use super() instead of super(Class, self)\n- Avoid mutable default arguments\n- Fix long lines and other ruff issues) +} } # ...existing code for conv3x3, conv1x1, BasicBlock, Bottleneck, ResNet, init_pretrained_weights, and instantiation functions... From d572f490159473af1d34045aa9d8465a163340b5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:57:54 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ethology/reid/backbones/mlfn.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ethology/reid/backbones/mlfn.py b/ethology/reid/backbones/mlfn.py index bdd019e1..c638a5d5 100644 --- a/ethology/reid/backbones/mlfn.py +++ b/ethology/reid/backbones/mlfn.py @@ -1,4 +1,3 @@ - """MLFN backbone for person re-identification.""" import torch @@ -117,7 +116,7 @@ def __init__( super().__init__() if channels is None: channels = [64, 256, 512, 1024, 2048] - channels=None, + channels = (None,) self.groups = groups # first convolutional layer @@ -271,6 +270,3 @@ def mlfn(num_classes, loss="softmax", pretrained=True, **kwargs): stacklevel=2, ) return model - - -