From 85c8d2ff12b8733723696dec54b97bd024e373f6 Mon Sep 17 00:00:00 2001
From: Maksym Ostapenko
Date: Wed, 4 Dec 2019 01:38:42 +0200
Subject: [PATCH] Initial commit.

---
 .gitignore                     |  11 +
 README.md                      |  41 ++++
 deep_sort/__init__.py          |   0
 deep_sort/detection.py         |  48 +++++
 deep_sort/iou_matching.py      |  83 ++++++++
 deep_sort/kalman_filter.py     | 228 ++++++++++++++++++++
 deep_sort/linear_assignment.py | 191 +++++++++++++++++
 deep_sort/nn_matching.py       | 176 ++++++++++++++++
 deep_sort/preprocessing.py     |  72 +++++++
 deep_sort/track.py             | 163 ++++++++++++++
 deep_sort/tracker.py           | 139 ++++++++++++
 demo.py                        | 168 +++++++++++++++
 tools/__init__.py              |   0
 tools/generate_detections.py   | 181 ++++++++++++++++
 yolo3/__init__.py              |   0
 yolo3/model.py                 | 373 +++++++++++++++++++++++++++++++++
 yolo3/utils.py                 |  30 +++
 yolo3/yolo.py                  | 119 +++++++++++
 18 files changed, 2023 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 deep_sort/__init__.py
 create mode 100644 deep_sort/detection.py
 create mode 100644 deep_sort/iou_matching.py
 create mode 100644 deep_sort/kalman_filter.py
 create mode 100644 deep_sort/linear_assignment.py
 create mode 100644 deep_sort/nn_matching.py
 create mode 100644 deep_sort/preprocessing.py
 create mode 100644 deep_sort/track.py
 create mode 100644 deep_sort/tracker.py
 create mode 100644 demo.py
 create mode 100644 tools/__init__.py
 create mode 100644 tools/generate_detections.py
 create mode 100644 yolo3/__init__.py
 create mode 100644 yolo3/model.py
 create mode 100644 yolo3/utils.py
 create mode 100644 yolo3/yolo.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1499b15
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+input/*
+output/*
+.idea/*
+__pycache__
+.ipynb_checkpoints
+model_data/*
+
+*.pyc
+*.swp
+*.swo
+*.swn
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bcf2c80
--- /dev/null
+++ b/README.md
@@ -0,0 +1,41 @@
+# A3: People counter
+
+The main goal of this project is to count people on the street, so all parameters are tuned for that task.
+
+### Quick Start
+
+1. Clone the repository.
+2. Download the converted weights: [yolo.h5 model file with tf-1.4.0](https://drive.google.com/file/d/1uvXFacPnrSMw6ldWTyLLjGLETlEsUvcE/view?usp=sharing). Put them into the **model_data** folder.
+3. Install the requirements.
+4. Specify the path to the input file and run the model:
+   ```
+   python demo.py --videofile="path/to/your/videofile" --out_root_dir="path/to/output/dir/"
+   ```
+
+### Dependencies
+
+The code is compatible with Python 3. The following dependencies are needed to run the tracker:
+
+    NumPy
+    scikit-learn
+    OpenCV
+    Pillow
+    Keras
+
+Additionally, feature generation requires TensorFlow-1.4.0.
+
+### Run for other classes
+
+Be aware that the code ignores every class except `person`. Change the class check if you want to run the tracker on other object types:
+
+    [A3/yolo3/yolo.py]:
+
+    if predicted_class != 'person':
+        continue
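+
+For example, to count several classes at once, the check could be relaxed along
+these lines (an illustrative sketch; the `allowed_classes` name is introduced
+here for illustration and is not part of the repository):
+
+    allowed_classes = {'person', 'bicycle', 'car'}
+    if predicted_class not in allowed_classes:
+        continue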
+
+### Notes for future work
+
+The Keras version of YOLO used here to get the bounding boxes is too slow; you
+can replace it with any detector you like.
+
+The model file model_data/mars-small128.pb needed by deep_sort has been
+converted for tensorflow-1.4.0.
+
+**This work is mainly based on https://github.com/Qidian213/deep_sort_yolov3.
+Thanks a lot to its author.**
\ No newline at end of file
diff --git a/deep_sort/__init__.py b/deep_sort/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/deep_sort/detection.py b/deep_sort/detection.py
new file mode 100644
index 0000000..5d08b86
--- /dev/null
+++ b/deep_sort/detection.py
@@ -0,0 +1,48 @@
+import numpy as np
+
+
+class Detection(object):
+    """
+    This class represents a bounding box detection in a single image.
+
+    Parameters
+    ----------
+    tlwh : array_like
+        Bounding box in format `(x, y, w, h)`.
+    confidence : float
+        Detector confidence score.
+    feature : array_like
+        A feature vector that describes the object contained in this image.
+
+    Attributes
+    ----------
+    tlwh : ndarray
+        Bounding box in format `(top left x, top left y, width, height)`.
+    confidence : float
+        Detector confidence score.
+    feature : ndarray | NoneType
+        A feature vector that describes the object contained in this image.
+
+    """
+
+    def __init__(self, tlwh, confidence, feature):
+        self.tlwh = np.asarray(tlwh, dtype=np.float)
+        self.confidence = float(confidence)
+        self.feature = np.asarray(feature, dtype=np.float32)
+
+    def to_tlbr(self):
+        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+        """
+        ret = self.tlwh.copy()
+        ret[2:] += ret[:2]
+        return ret
+
+    def to_xyah(self):
+        """Convert bounding box to format `(center x, center y, aspect ratio,
+        height)`, where the aspect ratio is `width / height`.
+        """
+        ret = self.tlwh.copy()
+        ret[:2] += ret[2:] / 2
+        ret[2] /= ret[3]
+        return ret
diff --git a/deep_sort/iou_matching.py b/deep_sort/iou_matching.py
new file mode 100644
index 0000000..65134e0
--- /dev/null
+++ b/deep_sort/iou_matching.py
@@ -0,0 +1,83 @@
+from __future__ import absolute_import
+
+import numpy as np
+
+from . import linear_assignment
+
+
+def iou(bbox, candidates):
+    """Compute intersection over union.
+
+    Parameters
+    ----------
+    bbox : ndarray
+        A bounding box in format `(top left x, top left y, width, height)`.
+    candidates : ndarray
+        A matrix of candidate bounding boxes (one per row) in the same format
+        as `bbox`.
+
+    Returns
+    -------
+    ndarray
+        The intersection over union in [0, 1] between the `bbox` and each
+        candidate. A higher score means a larger fraction of the `bbox` is
+        occluded by the candidate.
+
+    """
+    bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:]
+    candidates_tl = candidates[:, :2]
+    candidates_br = candidates[:, :2] + candidates[:, 2:]
+
+    tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis],
+               np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]]
+    br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis],
+               np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]]
+    wh = np.maximum(0., br - tl)
+
+    area_intersection = wh.prod(axis=1)
+    area_bbox = bbox[2:].prod()
+    area_candidates = candidates[:, 2:].prod(axis=1)
+    return area_intersection / (area_bbox + area_candidates - area_intersection)
+
+
+def iou_cost(tracks, detections, track_indices=None,
+             detection_indices=None):
+    """An intersection over union distance metric.
+
+    Parameters
+    ----------
+    tracks : List[deep_sort.track.Track]
+        A list of tracks.
+    detections : List[deep_sort.detection.Detection]
+        A list of detections.
+    track_indices : Optional[List[int]]
+        A list of indices to tracks that should be matched. Defaults to
+        all `tracks`.
+    detection_indices : Optional[List[int]]
+        A list of indices to detections that should be matched. Defaults
+        to all `detections`.
+ + Returns + ------- + ndarray + Returns a cost matrix of shape + len(track_indices), len(detection_indices) where entry (i, j) is + `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. + + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + cost_matrix = np.zeros((len(track_indices), len(detection_indices))) + for row, track_idx in enumerate(track_indices): + if tracks[track_idx].time_since_update > 1: + cost_matrix[row, :] = linear_assignment.INFTY_COST + continue + + bbox = tracks[track_idx].to_tlwh() + candidates = np.asarray([detections[i].tlwh for i in detection_indices]) + cost_matrix[row, :] = 1. - iou(bbox, candidates) + + return cost_matrix diff --git a/deep_sort/kalman_filter.py b/deep_sort/kalman_filter.py new file mode 100644 index 0000000..076cd5c --- /dev/null +++ b/deep_sort/kalman_filter.py @@ -0,0 +1,228 @@ +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. 
+ covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. 
+ + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha diff --git a/deep_sort/linear_assignment.py b/deep_sort/linear_assignment.py new file mode 100644 index 0000000..51a08b6 --- /dev/null +++ b/deep_sort/linear_assignment.py @@ -0,0 +1,191 @@ +from __future__ import absolute_import + +import numpy as np +from sklearn.utils.linear_assignment_ import linear_assignment + +from . import kalman_filter + + +INFTY_COST = 1e+5 + + +def min_cost_matching( + distance_metric, max_distance, tracks, detections, track_indices=None, + detection_indices=None): + """Solve linear assignment problem. + + Parameters + ---------- + distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray + The distance metric is given a list of tracks and detections as well as + a list of N track indices and M detection indices. The metric should + return the NxM dimensional cost matrix, where element (i, j) is the + association cost between the i-th track in the given track indices and + the j-th detection in the given detection_indices. + max_distance : float + Gating threshold. Associations with cost larger than this value are + disregarded. + tracks : List[track.Track] + A list of predicted tracks at the current time step. + detections : List[detection.Detection] + A list of detections at the current time step. + track_indices : List[int] + List of track indices that maps rows in `cost_matrix` to tracks in + `tracks` (see description above). + detection_indices : List[int] + List of detection indices that maps columns in `cost_matrix` to + detections in `detections` (see description above). + + Returns + ------- + (List[(int, int)], List[int], List[int]) + Returns a tuple with the following three entries: + * A list of matched track and detection indices. + * A list of unmatched track indices. + * A list of unmatched detection indices. + + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + if len(detection_indices) == 0 or len(track_indices) == 0: + return [], track_indices, detection_indices # Nothing to match. 
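+    # Costs above the gating threshold are clamped to just past `max_distance`
+    # so the Hungarian solver still receives a complete matrix; those pairs
+    # are filtered back out as unmatched after the assignment.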
+
+    cost_matrix = distance_metric(
+        tracks, detections, track_indices, detection_indices)
+    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
+    indices = linear_assignment(cost_matrix)
+
+    matches, unmatched_tracks, unmatched_detections = [], [], []
+    for col, detection_idx in enumerate(detection_indices):
+        if col not in indices[:, 1]:
+            unmatched_detections.append(detection_idx)
+    for row, track_idx in enumerate(track_indices):
+        if row not in indices[:, 0]:
+            unmatched_tracks.append(track_idx)
+    for row, col in indices:
+        track_idx = track_indices[row]
+        detection_idx = detection_indices[col]
+        if cost_matrix[row, col] > max_distance:
+            unmatched_tracks.append(track_idx)
+            unmatched_detections.append(detection_idx)
+        else:
+            matches.append((track_idx, detection_idx))
+    return matches, unmatched_tracks, unmatched_detections
+
+
+def matching_cascade(
+        distance_metric, max_distance, cascade_depth, tracks, detections,
+        track_indices=None, detection_indices=None):
+    """Run matching cascade.
+
+    Parameters
+    ----------
+    distance_metric : Callable[List[Track], List[Detection], List[int], List[int]] -> ndarray
+        The distance metric is given a list of tracks and detections as well as
+        a list of N track indices and M detection indices. The metric should
+        return the NxM dimensional cost matrix, where element (i, j) is the
+        association cost between the i-th track in the given track indices and
+        the j-th detection in the given detection indices.
+    max_distance : float
+        Gating threshold. Associations with cost larger than this value are
+        disregarded.
+    cascade_depth: int
+        The cascade depth, should be set to the maximum track age.
+    tracks : List[track.Track]
+        A list of predicted tracks at the current time step.
+    detections : List[detection.Detection]
+        A list of detections at the current time step.
+    track_indices : Optional[List[int]]
+        List of track indices that maps rows in `cost_matrix` to tracks in
+        `tracks` (see description above). Defaults to all tracks.
+    detection_indices : Optional[List[int]]
+        List of detection indices that maps columns in `cost_matrix` to
+        detections in `detections` (see description above). Defaults to all
+        detections.
+
+    Returns
+    -------
+    (List[(int, int)], List[int], List[int])
+        Returns a tuple with the following three entries:
+        * A list of matched track and detection indices.
+        * A list of unmatched track indices.
+        * A list of unmatched detection indices.
+
+    """
+    if track_indices is None:
+        track_indices = list(range(len(tracks)))
+    if detection_indices is None:
+        detection_indices = list(range(len(detections)))
+
+    unmatched_detections = detection_indices
+    matches = []
+    for level in range(cascade_depth):
+        if len(unmatched_detections) == 0:  # No detections left
+            break
+
+        track_indices_l = [
+            k for k in track_indices
+            if tracks[k].time_since_update == 1 + level
+        ]
+        if len(track_indices_l) == 0:  # Nothing to match at this level
+            continue
+
+        matches_l, _, unmatched_detections = \
+            min_cost_matching(
+                distance_metric, max_distance, tracks, detections,
+                track_indices_l, unmatched_detections)
+        matches += matches_l
+    unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches))
+    return matches, unmatched_tracks, unmatched_detections
+
+
+def gate_cost_matrix(
+        kf, cost_matrix, tracks, detections, track_indices, detection_indices,
+        gated_cost=INFTY_COST, only_position=False):
+    """Invalidate infeasible entries in cost matrix based on the state
+    distributions obtained by Kalman filtering.
+
+    Parameters
+    ----------
+    kf : The Kalman filter.
+    cost_matrix : ndarray
+        The NxM dimensional cost matrix, where N is the number of track indices
+        and M is the number of detection indices, such that entry (i, j) is the
+        association cost between `tracks[track_indices[i]]` and
+        `detections[detection_indices[j]]`.
+    tracks : List[track.Track]
+        A list of predicted tracks at the current time step.
+    detections : List[detection.Detection]
+        A list of detections at the current time step.
+    track_indices : List[int]
+        List of track indices that maps rows in `cost_matrix` to tracks in
+        `tracks` (see description above).
+    detection_indices : List[int]
+        List of detection indices that maps columns in `cost_matrix` to
+        detections in `detections` (see description above).
+    gated_cost : Optional[float]
+        Entries in the cost matrix corresponding to infeasible associations are
+        set to this value. Defaults to a very large value.
+    only_position : Optional[bool]
+        If True, only the x, y position of the state distribution is considered
+        during gating. Defaults to False.
+
+    Returns
+    -------
+    ndarray
+        Returns the modified cost matrix.
+
+    """
+    gating_dim = 2 if only_position else 4
+    gating_threshold = kalman_filter.chi2inv95[gating_dim]
+    measurements = np.asarray(
+        [detections[i].to_xyah() for i in detection_indices])
+    for row, track_idx in enumerate(track_indices):
+        track = tracks[track_idx]
+        gating_distance = kf.gating_distance(
+            track.mean, track.covariance, measurements, only_position)
+        cost_matrix[row, gating_distance > gating_threshold] = gated_cost
+    return cost_matrix
diff --git a/deep_sort/nn_matching.py b/deep_sort/nn_matching.py
new file mode 100644
index 0000000..0f5f88f
--- /dev/null
+++ b/deep_sort/nn_matching.py
@@ -0,0 +1,176 @@
+import numpy as np
+
+
+def _pdist(a, b):
+    """Compute pair-wise squared distance between points in `a` and `b`.
+
+    Parameters
+    ----------
+    a : array_like
+        An NxM matrix of N samples of dimensionality M.
+    b : array_like
+        An LxM matrix of L samples of dimensionality M.
+
+    Returns
+    -------
+    ndarray
+        Returns a matrix of size len(a), len(b) such that element (i, j)
+        contains the squared distance between `a[i]` and `b[j]`.
+
+    """
+    a, b = np.asarray(a), np.asarray(b)
+    if len(a) == 0 or len(b) == 0:
+        return np.zeros((len(a), len(b)))
+    a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1)
+    r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :]
+    r2 = np.clip(r2, 0., float(np.inf))
+    return r2
+
+
+def _cosine_distance(a, b, data_is_normalized=False):
+    """Compute pair-wise cosine distance between points in `a` and `b`.
+
+    Parameters
+    ----------
+    a : array_like
+        An NxM matrix of N samples of dimensionality M.
+    b : array_like
+        An LxM matrix of L samples of dimensionality M.
+    data_is_normalized : Optional[bool]
+        If True, assumes rows in a and b are unit length vectors.
+        Otherwise, a and b are explicitly normalized to length 1.
+
+    Returns
+    -------
+    ndarray
+        Returns a matrix of size len(a), len(b) such that element (i, j)
+        contains the cosine distance between `a[i]` and `b[j]`.
+
+    """
+    if not data_is_normalized:
+        a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True)
+        b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True)
+    return 1. - np.dot(a, b.T)
+
+
+def _nn_euclidean_distance(x, y):
+    """ Helper function for nearest neighbor distance metric (Euclidean).
+
+    Parameters
+    ----------
+    x : ndarray
+        A matrix of N row-vectors (sample points).
+    y : ndarray
+        A matrix of M row-vectors (query points).
+ + Returns + ------- + ndarray + A vector of length M that contains for each entry in `y` the + smallest Euclidean distance to a sample in `x`. + + """ + distances = _pdist(x, y) + return np.maximum(0.0, distances.min(axis=0)) + + +def _nn_cosine_distance(x, y): + """ Helper function for nearest neighbor distance metric (cosine). + + Parameters + ---------- + x : ndarray + A matrix of N row-vectors (sample points). + y : ndarray + A matrix of M row-vectors (query points). + + Returns + ------- + ndarray + A vector of length M that contains for each entry in `y` the + smallest cosine distance to a sample in `x`. + + """ + distances = _cosine_distance(x, y) + return distances.min(axis=0) + + +class NearestNeighborDistanceMetric(object): + """ + A nearest neighbor distance metric that, for each target, returns + the closest distance to any sample that has been observed so far. + + Parameters + ---------- + metric : str + Either "euclidean" or "cosine". + matching_threshold: float + The matching threshold. Samples with larger distance are considered an + invalid match. + budget : Optional[int] + If not None, fix samples per class to at most this number. Removes + the oldest samples when the budget is reached. + + Attributes + ---------- + samples : Dict[int -> List[ndarray]] + A dictionary that maps from target identities to the list of samples + that have been observed so far. + + """ + + def __init__(self, metric, matching_threshold, budget=None): + + + if metric == "euclidean": + self._metric = _nn_euclidean_distance + elif metric == "cosine": + self._metric = _nn_cosine_distance + else: + raise ValueError( + "Invalid metric; must be either 'euclidean' or 'cosine'") + self.matching_threshold = matching_threshold + self.budget = budget + self.samples = {} + + def partial_fit(self, features, targets, active_targets): + """Update the distance metric with new data. + + Parameters + ---------- + features : ndarray + An NxM matrix of N features of dimensionality M. + targets : ndarray + An integer array of associated target identities. + active_targets : List[int] + A list of targets that are currently present in the scene. + + """ + for feature, target in zip(features, targets): + self.samples.setdefault(target, []).append(feature) + if self.budget is not None: + self.samples[target] = self.samples[target][-self.budget:] + self.samples = {k: self.samples[k] for k in active_targets} + + def distance(self, features, targets): + """Compute distance between features and targets. + + Parameters + ---------- + features : ndarray + An NxM matrix of N features of dimensionality M. + targets : List[int] + A list of targets to match the given `features` against. + + Returns + ------- + ndarray + Returns a cost matrix of shape len(targets), len(features), where + element (i, j) contains the closest squared distance between + `targets[i]` and `features[j]`. + + """ + cost_matrix = np.zeros((len(targets), len(features))) + for i, target in enumerate(targets): + cost_matrix[i, :] = self._metric(self.samples[target], features) + return cost_matrix diff --git a/deep_sort/preprocessing.py b/deep_sort/preprocessing.py new file mode 100644 index 0000000..98cf558 --- /dev/null +++ b/deep_sort/preprocessing.py @@ -0,0 +1,72 @@ +import cv2 +import numpy as np + + +def non_max_suppression(boxes, max_bbox_overlap, scores=None): + """Suppress overlapping detections. + + Original code from [1]_ has been adapted to include confidence score. + + .. 
[1] http://www.pyimagesearch.com/2015/02/16/
+        faster-non-maximum-suppression-python/
+
+    Examples
+    --------
+
+        >>> boxes = [d.roi for d in detections]
+        >>> scores = [d.confidence for d in detections]
+        >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores)
+        >>> detections = [detections[i] for i in indices]
+
+    Parameters
+    ----------
+    boxes : ndarray
+        Array of ROIs (x, y, width, height).
+    max_bbox_overlap : float
+        ROIs that overlap more than this value are suppressed.
+    scores : Optional[array_like]
+        Detector confidence score.
+
+    Returns
+    -------
+    List[int]
+        Returns indices of detections that have survived non-maxima suppression.
+
+    """
+    if len(boxes) == 0:
+        return []
+
+    boxes = boxes.astype(np.float)
+    pick = []
+
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2] + boxes[:, 0]
+    y2 = boxes[:, 3] + boxes[:, 1]
+
+    area = (x2 - x1 + 1) * (y2 - y1 + 1)
+    if scores is not None:
+        idxs = np.argsort(scores)
+    else:
+        idxs = np.argsort(y2)
+
+    while len(idxs) > 0:
+        last = len(idxs) - 1
+        i = idxs[last]
+        pick.append(i)
+
+        xx1 = np.maximum(x1[i], x1[idxs[:last]])
+        yy1 = np.maximum(y1[i], y1[idxs[:last]])
+        xx2 = np.minimum(x2[i], x2[idxs[:last]])
+        yy2 = np.minimum(y2[i], y2[idxs[:last]])
+
+        w = np.maximum(0, xx2 - xx1 + 1)
+        h = np.maximum(0, yy2 - yy1 + 1)
+
+        overlap = (w * h) / area[idxs[:last]]
+
+        idxs = np.delete(
+            idxs, np.concatenate(
+                ([last], np.where(overlap > max_bbox_overlap)[0])))
+
+    return pick
diff --git a/deep_sort/track.py b/deep_sort/track.py
new file mode 100644
index 0000000..3a713fc
--- /dev/null
+++ b/deep_sort/track.py
@@ -0,0 +1,163 @@
+class TrackState:
+    """
+    Enumeration type for the single target track state. Newly created tracks are
+    classified as `tentative` until enough evidence has been collected. Then,
+    the track state is changed to `confirmed`. Tracks that are no longer alive
+    are classified as `deleted` to mark them for removal from the set of active
+    tracks.
+
+    """
+
+    Tentative = 1
+    Confirmed = 30
+    Deleted = 60
+
+
+class Track:
+    """
+    A single target track with state space `(x, y, a, h)` and associated
+    velocities, where `(x, y)` is the center of the bounding box, `a` is the
+    aspect ratio and `h` is the height.
+
+    Parameters
+    ----------
+    mean : ndarray
+        Mean vector of the initial state distribution.
+    covariance : ndarray
+        Covariance matrix of the initial state distribution.
+    track_id : int
+        A unique track identifier.
+    n_init : int
+        Number of consecutive detections before the track is confirmed. The
+        track state is set to `Deleted` if a miss occurs within the first
+        `n_init` frames.
+    max_age : int
+        The maximum number of consecutive misses before the track state is
+        set to `Deleted`.
+    feature : Optional[ndarray]
+        Feature vector of the detection this track originates from. If not None,
+        this feature is added to the `features` cache.
+
+    Attributes
+    ----------
+    mean : ndarray
+        Mean vector of the initial state distribution.
+    covariance : ndarray
+        Covariance matrix of the initial state distribution.
+    track_id : int
+        A unique track identifier.
+    hits : int
+        Total number of measurement updates.
+    age : int
+        Total number of frames since first occurrence.
+    time_since_update : int
+        Total number of frames since last measurement update.
+    state : TrackState
+        The current track state.
+    features : List[ndarray]
+        A cache of features. On each measurement update, the associated feature
+        vector is added to this list.
+
+    """
+
+    def __init__(self, mean, covariance, track_id, n_init, max_age,
+                 feature=None):
+        self.mean = mean
+        self.covariance = covariance
+        self.track_id = track_id
+        self.hits = 1
+        self.age = 1
+        self.time_since_update = 0
+
+        self.state = TrackState.Tentative
+        self.features = []
+        if feature is not None:
+            self.features.append(feature)
+
+        self._n_init = n_init
+        self._max_age = max_age
+
+    def to_tlwh(self):
+        """Get current position in bounding box format `(top left x, top left y,
+        width, height)`.
+
+        Returns
+        -------
+        ndarray
+            The bounding box.
+
+        """
+        ret = self.mean[:4].copy()
+        ret[2] *= ret[3]
+        ret[:2] -= ret[2:] / 2
+        return ret
+
+    def to_tlbr(self):
+        """Get current position in bounding box format `(min x, min y, max x,
+        max y)`.
+
+        Returns
+        -------
+        ndarray
+            The bounding box.
+
+        """
+        ret = self.to_tlwh()
+        ret[2:] = ret[:2] + ret[2:]
+        return ret
+
+    def predict(self, kf):
+        """Propagate the state distribution to the current time step using a
+        Kalman filter prediction step.
+
+        Parameters
+        ----------
+        kf : kalman_filter.KalmanFilter
+            The Kalman filter.
+
+        """
+        self.mean, self.covariance = kf.predict(self.mean, self.covariance)
+        self.age += 1
+        self.time_since_update += 1
+
+    def update(self, kf, detection):
+        """Perform Kalman filter measurement update step and update the feature
+        cache.
+
+        Parameters
+        ----------
+        kf : kalman_filter.KalmanFilter
+            The Kalman filter.
+        detection : Detection
+            The associated detection.
+
+        """
+        self.mean, self.covariance = kf.update(
+            self.mean, self.covariance, detection.to_xyah())
+        self.features.append(detection.feature)
+
+        self.hits += 1
+        self.time_since_update = 0
+        if self.state == TrackState.Tentative and self.hits >= self._n_init:
+            self.state = TrackState.Confirmed
+
+    def mark_missed(self):
+        """Mark this track as missed (no association at the current time step).
+        """
+        if self.state == TrackState.Tentative:
+            self.state = TrackState.Deleted
+        elif self.time_since_update > self._max_age:
+            self.state = TrackState.Deleted
+
+    def is_tentative(self):
+        """Returns True if this track is tentative (unconfirmed).
+        """
+        return self.state == TrackState.Tentative
+
+    def is_confirmed(self):
+        """Returns True if this track is confirmed."""
+        return self.state == TrackState.Confirmed
+
+    def is_deleted(self):
+        """Returns True if this track is dead and should be deleted."""
+        return self.state == TrackState.Deleted
diff --git a/deep_sort/tracker.py b/deep_sort/tracker.py
new file mode 100644
index 0000000..561a236
--- /dev/null
+++ b/deep_sort/tracker.py
@@ -0,0 +1,139 @@
+from __future__ import absolute_import
+
+import numpy as np
+
+from .track import Track
+from . import iou_matching
+from . import kalman_filter
+from . import linear_assignment
+
+
+class Tracker:
+    """
+    This is the multi-target tracker.
+
+    Parameters
+    ----------
+    metric : nn_matching.NearestNeighborDistanceMetric
+        A distance metric for measurement-to-track association.
+    max_age : int
+        Maximum number of consecutive misses before a track is deleted.
+    n_init : int
+        Number of consecutive detections before the track is confirmed. The
+        track state is set to `Deleted` if a miss occurs within the first
+        `n_init` frames.
+
+    Attributes
+    ----------
+    metric : nn_matching.NearestNeighborDistanceMetric
+        The distance metric used for measurement to track association.
+    max_age : int
+        Maximum number of consecutive misses before a track is deleted.
+    n_init : int
+        Number of frames that a track remains in initialization phase.
+ kf : kalman_filter.KalmanFilter + A Kalman filter to filter target trajectories in image space. + tracks : List[Track] + The list of active tracks at the current time step. + + """ + + def __init__(self, metric, max_iou_distance=0.7, max_age=90, n_init=15): + self.metric = metric + self.max_iou_distance = max_iou_distance + self.max_age = max_age + self.n_init = n_init + + self.kf = kalman_filter.KalmanFilter() + self.tracks = [] + self._next_id = 0 + + def predict(self): + """Propagate track state distributions one time step forward. + + This function should be called once every time step, before `update`. + """ + for track in self.tracks: + track.predict(self.kf) + + def update(self, detections): + """Perform measurement update and track management. + + Parameters + ---------- + detections : List[deep_sort.detection.Detection] + A list of detections at the current time step. + + """ + # Run matching cascade. + matches, unmatched_tracks, unmatched_detections = \ + self._match(detections) + + # Update track set. + for track_idx, detection_idx in matches: + self.tracks[track_idx].update( + self.kf, detections[detection_idx]) + for track_idx in unmatched_tracks: + self.tracks[track_idx].mark_missed() + for detection_idx in unmatched_detections: + self._initiate_track(detections[detection_idx]) + self.tracks = [t for t in self.tracks if not t.is_deleted()] + + # Update distance metric. + active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] + features, targets = [], [] + for track in self.tracks: + if not track.is_confirmed(): + continue + features += track.features + targets += [track.track_id for _ in track.features] + track.features = [] + self.metric.partial_fit( + np.asarray(features), np.asarray(targets), active_targets) + + def _match(self, detections): + + def gated_metric(tracks, dets, track_indices, detection_indices): + features = np.array([dets[i].feature for i in detection_indices]) + targets = np.array([tracks[i].track_id for i in track_indices]) + cost_matrix = self.metric.distance(features, targets) + cost_matrix = linear_assignment.gate_cost_matrix( + self.kf, cost_matrix, tracks, dets, track_indices, + detection_indices) + + return cost_matrix + + # Split track set into confirmed and unconfirmed tracks. + confirmed_tracks = [ + i for i, t in enumerate(self.tracks) if t.is_confirmed()] + unconfirmed_tracks = [ + i for i, t in enumerate(self.tracks) if not t.is_confirmed()] + + # Associate confirmed tracks using appearance features. + matches_a, unmatched_tracks_a, unmatched_detections = \ + linear_assignment.matching_cascade( + gated_metric, self.metric.matching_threshold, self.max_age, + self.tracks, detections, confirmed_tracks) + + # Associate remaining tracks together with unconfirmed tracks using IOU. 
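+        # Appearance descriptors are unreliable for brand-new (unconfirmed)
+        # tracks and for tracks that were missed exactly one frame ago, so
+        # these candidates are associated by bounding-box overlap instead.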
+        iou_track_candidates = unconfirmed_tracks + [
+            k for k in unmatched_tracks_a if
+            self.tracks[k].time_since_update == 1]
+        unmatched_tracks_a = [
+            k for k in unmatched_tracks_a if
+            self.tracks[k].time_since_update != 1]
+        matches_b, unmatched_tracks_b, unmatched_detections = \
+            linear_assignment.min_cost_matching(
+                iou_matching.iou_cost, self.max_iou_distance, self.tracks,
+                detections, iou_track_candidates, unmatched_detections)
+
+        matches = matches_a + matches_b
+        unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
+        return matches, unmatched_tracks, unmatched_detections
+
+    def _initiate_track(self, detection):
+        mean, covariance = self.kf.initiate(detection.to_xyah())
+        self.tracks.append(Track(
+            mean, covariance, self._next_id, self.n_init, self.max_age,
+            detection.feature))
+        self._next_id += 1
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..33e6333
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,168 @@
+from __future__ import division, print_function, absolute_import
+
+import os
+import argparse
+import warnings
+
+import cv2
+import numpy as np
+from PIL import Image
+from yolo3.yolo import YOLO
+
+from deep_sort import nn_matching
+from deep_sort import preprocessing
+from deep_sort.tracker import Tracker
+from deep_sort.detection import Detection
+from tools import generate_detections as gdet
+warnings.filterwarnings('ignore')
+
+
+def file_system_work(videofile, out_root_dir):
+    """Create output directories and files.
+    """
+    videofile_name = videofile.split('/')[-1].split('.')[0]
+    out_dir = os.path.join(out_root_dir, videofile_name)
+
+    # create directory for output
+    if not os.path.exists(out_root_dir):
+        os.makedirs(out_root_dir)
+
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    out_video_file_name = os.path.join(out_dir, 'RESULT_' + videofile_name)
+    out_list_file_name = os.path.join(out_dir, 'DETECTION_LIST_RESULT_' + videofile_name)
+
+    return out_video_file_name, out_list_file_name
+
+
+def main(detector, videofile='input/real.MOV', out_root_dir='output',
+         process_stream=False, writeVideo_flag=True, show_detections=False):
+    # Definition of the parameters
+    max_cosine_distance = 0.3
+    nn_budget = None
+    nms_max_overlap = 1.0
+
+    # deep_sort
+    model_filename = 'model_data/mars-small128.pb'
+    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
+
+    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
+    tracker = Tracker(metric)
+    tracks_ids = []
+
+    if process_stream:
+        print("SOURCE: Stream is processing.")
+        video_capture = cv2.VideoCapture(0)
+    else:
+        print("SOURCE: File {} is processing.".format(videofile))
+        video_capture = cv2.VideoCapture(videofile)
+
+    if writeVideo_flag:
+        # Define the codec and create VideoWriter object
+        w = int(video_capture.get(3))
+        h = int(video_capture.get(4))
+        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+        out_video_file_name, out_list_file_name = file_system_work(videofile, out_root_dir)
+
+        out = cv2.VideoWriter(out_video_file_name, fourcc, 15, (w, h))
+        list_file = open(out_list_file_name, 'w')
+        frame_index = -1
+
+    print('EXECUTION: Processing...')
+    print('EXECUTION: Press Q to stop execution.')
+    while video_capture.isOpened():
+        ret, frame = video_capture.read()  # frame shape 640*480*3
+
+        if not ret:
+            break
+
+        image = Image.fromarray(frame[..., ::-1])  # bgr to rgb
+
+        boxs = detector.detect_image(image)
+        features = encoder(frame, boxs)
+
+        # The detector does not provide per-box confidence scores here, so
+        # every detection score is set to 1.0.
+        detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(boxs, features)]
+
+        # Run non-maximum suppression.
+        boxes = np.array([d.tlwh for d in detections])
+        scores = np.array([d.confidence for d in detections])
+        indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
+        detections = [detections[i] for i in indices]
+
+        # Call the tracker
+        tracker.predict()
+        tracker.update(detections)
+
+        for track in tracker.tracks:
+            if not track.is_confirmed() or track.time_since_update > 1:
+                continue
+
+            bbox = track.to_tlbr()
+            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 255, 255), 2)
+            if track.track_id not in tracks_ids:
+                tracks_ids.append(track.track_id)
+
+            cv2.putText(frame, str(track.track_id), (int(bbox[0]), int(bbox[1])), 0, 5e-3 * 200, (0, 255, 0), 2)
+
+        if show_detections:
+            cv2.imshow('', frame)
+
+        if writeVideo_flag:
+            # save a frame
+            out.write(frame)
+            frame_index = frame_index + 1
+            list_file.write(str(frame_index) + ' ')
+            if len(boxs) != 0:
+                for i in range(0, len(boxs)):
+                    list_file.write(str(boxs[i][0]) + ' ' + str(boxs[i][1]) + ' ' + str(boxs[i][2]) + ' ' + str(boxs[i][3]) + ' ')
+
+            list_file.write('\n')
+
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    print('############ RESULT ###################')
+    print('RESULT: Number of tracks = ', len(tracks_ids))
+    print('############ RESULT ###################')
+    # end processing and write
+    video_capture.release()
+
+    if writeVideo_flag:
+        out.release()
+        list_file.close()
+
+    cv2.destroyAllWindows()
+
+
+def str2bool(value):
+    """Parse a command line string into a boolean.
+
+    argparse would otherwise treat any non-empty string, including "False",
+    as truthy.
+    """
+    return str(value).lower() in ('true', '1', 'yes')
+
+
+def parse_args():
+    """Parse command line arguments.
+    """
+    parser = argparse.ArgumentParser(description="People counter")
+
+    # All flags have sensible defaults, so none of them is marked required.
+    parser.add_argument("--videofile", default="input/real.MOV",
+                        help="Path to file which you want to process.")
+    parser.add_argument("--out_root_dir", default="output",
+                        help="Directory for output.")
+    parser.add_argument("--process_stream", type=str2bool, default=False,
+                        help="If True then read video from camera else process file")
+    parser.add_argument("--writeVideo_flag", type=str2bool, default=True,
+                        help="If True then write detections on output video else don't")
+    parser.add_argument("--show_detections", type=str2bool, default=False,
+                        help="If True display detections on each frame of video, else don't. "
+                             "NOTE: if you run on a server it has to be False")
+
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    detector = YOLO()
+    main(detector,
+         args.videofile,
+         args.out_root_dir,
+         args.process_stream,
+         args.writeVideo_flag,
+         args.show_detections)
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/generate_detections.py b/tools/generate_detections.py
new file mode 100644
index 0000000..4c47d2e
--- /dev/null
+++ b/tools/generate_detections.py
@@ -0,0 +1,181 @@
+# vim: expandtab:ts=4:sw=4
+import os
+import errno
+import argparse
+import numpy as np
+import cv2
+import tensorflow as tf
+
+
+def _run_in_batches(f, data_dict, out, batch_size):
+    data_len = len(out)
+    num_batches = int(data_len / batch_size)
+
+    s, e = 0, 0
+    for i in range(num_batches):
+        s, e = i * batch_size, (i + 1) * batch_size
+        batch_data_dict = {k: v[s:e] for k, v in data_dict.items()}
+        out[s:e] = f(batch_data_dict)
+    if e < len(out):
+        batch_data_dict = {k: v[e:] for k, v in data_dict.items()}
+        out[e:] = f(batch_data_dict)
+
+
+def extract_image_patch(image, bbox, patch_shape):
+    """Extract image patch from bounding box.
+ + Parameters + ---------- + image : ndarray + The full image. + bbox : array_like + The bounding box in format (x, y, width, height). + patch_shape : Optional[array_like] + This parameter can be used to enforce a desired patch shape + (height, width). First, the `bbox` is adapted to the aspect ratio + of the patch shape, then it is clipped at the image boundaries. + If None, the shape is computed from :arg:`bbox`. + + Returns + ------- + ndarray | NoneType + An image patch showing the :arg:`bbox`, optionally reshaped to + :arg:`patch_shape`. + Returns None if the bounding box is empty or fully outside of the image + boundaries. + + """ + bbox = np.array(bbox) + if patch_shape is not None: + # correct aspect ratio to patch shape + target_aspect = float(patch_shape[1]) / patch_shape[0] + new_width = target_aspect * bbox[3] + bbox[0] -= (new_width - bbox[2]) / 2 + bbox[2] = new_width + + # convert to top left, bottom right + bbox[2:] += bbox[:2] + bbox = bbox.astype(np.int) + + # clip at image boundaries + bbox[:2] = np.maximum(0, bbox[:2]) + bbox[2:] = np.minimum(np.asarray(image.shape[:2][::-1]) - 1, bbox[2:]) + if np.any(bbox[:2] >= bbox[2:]): + return None + sx, sy, ex, ey = bbox + image = image[sy:ey, sx:ex] + image = cv2.resize(image, tuple(patch_shape[::-1])) + return image + + +class ImageEncoder(object): + + def __init__(self, checkpoint_filename, input_name="images", + output_name="features"): + self.session = tf.Session() + with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle: + graph_def = tf.GraphDef() + graph_def.ParseFromString(file_handle.read()) + tf.import_graph_def(graph_def, name="net") + self.input_var = tf.get_default_graph().get_tensor_by_name( + "net/%s:0" % input_name) + self.output_var = tf.get_default_graph().get_tensor_by_name( + "net/%s:0" % output_name) + + assert len(self.output_var.get_shape()) == 2 + assert len(self.input_var.get_shape()) == 4 + self.feature_dim = self.output_var.get_shape().as_list()[-1] + self.image_shape = self.input_var.get_shape().as_list()[1:] + + def __call__(self, data_x, batch_size=32): + out = np.zeros((len(data_x), self.feature_dim), np.float32) + _run_in_batches( + lambda x: self.session.run(self.output_var, feed_dict=x), + {self.input_var: data_x}, out, batch_size) + return out + + +def create_box_encoder(model_filename, input_name="images", + output_name="features", batch_size=32): + image_encoder = ImageEncoder(model_filename, input_name, output_name) + image_shape = image_encoder.image_shape + + def encoder(image, boxes): + image_patches = [] + for box in boxes: + patch = extract_image_patch(image, box, image_shape[:2]) + if patch is None: + print("WARNING: Failed to extract image patch: %s." % str(box)) + patch = np.random.uniform( + 0., 255., image_shape).astype(np.uint8) + image_patches.append(patch) + image_patches = np.asarray(image_patches) + return image_encoder(image_patches, batch_size) + + return encoder + + +def generate_detections(encoder, mot_dir, output_dir, detection_dir=None): + """Generate detections with features. + + Parameters + ---------- + encoder : Callable[image, ndarray] -> ndarray + The encoder function takes as input a BGR color image and a matrix of + bounding boxes in format `(x, y, w, h)` and returns a matrix of + corresponding feature vectors. + mot_dir : str + Path to the MOTChallenge directory (can be either train or test). + output_dir + Path to the output directory. Will be created if it does not exist. + detection_dir + Path to custom detections. 
The directory structure should be the default
+        MOTChallenge structure: `[sequence]/det/det.txt`. If None, uses the
+        standard MOTChallenge detections.
+
+    """
+    if detection_dir is None:
+        detection_dir = mot_dir
+    try:
+        os.makedirs(output_dir)
+    except OSError as exception:
+        if exception.errno == errno.EEXIST and os.path.isdir(output_dir):
+            pass
+        else:
+            raise ValueError(
+                "Failed to create output directory '%s'" % output_dir)
+
+    for sequence in os.listdir(mot_dir):
+        print("Processing %s" % sequence)
+        sequence_dir = os.path.join(mot_dir, sequence)
+
+        image_dir = os.path.join(sequence_dir, "img1")
+        image_filenames = {
+            int(os.path.splitext(f)[0]): os.path.join(image_dir, f)
+            for f in os.listdir(image_dir)}
+
+        detection_file = os.path.join(
+            detection_dir, sequence, "det/det.txt")
+        detections_in = np.loadtxt(detection_file, delimiter=',')
+        detections_out = []
+
+        frame_indices = detections_in[:, 0].astype(np.int)
+        min_frame_idx = frame_indices.astype(np.int).min()
+        max_frame_idx = frame_indices.astype(np.int).max()
+        for frame_idx in range(min_frame_idx, max_frame_idx + 1):
+            print("Frame %05d/%05d" % (frame_idx, max_frame_idx))
+            mask = frame_indices == frame_idx
+            rows = detections_in[mask]
+
+            if frame_idx not in image_filenames:
+                print("WARNING could not find image for frame %d" % frame_idx)
+                continue
+            bgr_image = cv2.imread(
+                image_filenames[frame_idx], cv2.IMREAD_COLOR)
+            features = encoder(bgr_image, rows[:, 2:6].copy())
+            detections_out += [np.r_[(row, feature)] for row, feature
+                               in zip(rows, features)]
+
+        output_filename = os.path.join(output_dir, "%s.npy" % sequence)
+        np.save(
+            output_filename, np.asarray(detections_out), allow_pickle=False)
diff --git a/yolo3/__init__.py b/yolo3/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/yolo3/model.py b/yolo3/model.py
new file mode 100644
index 0000000..e890e50
--- /dev/null
+++ b/yolo3/model.py
@@ -0,0 +1,373 @@
+"""YOLO_v3 Model Defined in Keras."""
+
+from functools import wraps
+
+import numpy as np
+import tensorflow as tf
+from keras import backend as K
+
+from keras.models import Model
+from keras.regularizers import l2
+from keras.layers.advanced_activations import LeakyReLU
+from keras.layers.normalization import BatchNormalization
+from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate
+
+from yolo3.utils import compose
+
+
+@wraps(Conv2D)
+def DarknetConv2D(*args, **kwargs):
+    """Wrapper to set Darknet parameters for Convolution2D."""
+    darknet_conv_kwargs = {'kernel_regularizer': l2(5e-4)}
+    darknet_conv_kwargs['padding'] = 'valid' if kwargs.get('strides')==(2,2) else 'same'
+    darknet_conv_kwargs.update(kwargs)
+    return Conv2D(*args, **darknet_conv_kwargs)
+
+
+def DarknetConv2D_BN_Leaky(*args, **kwargs):
+    """Darknet Convolution2D followed by BatchNormalization and LeakyReLU."""
+    no_bias_kwargs = {'use_bias': False}
+    no_bias_kwargs.update(kwargs)
+    return compose(
+        DarknetConv2D(*args, **no_bias_kwargs),
+        BatchNormalization(),
+        LeakyReLU(alpha=0.1))
+
+
+def resblock_body(x, num_filters, num_blocks):
+    '''A series of resblocks starting with a downsampling Convolution2D'''
+    # Darknet uses left and top padding instead of 'same' mode
+    x = ZeroPadding2D(((1,0),(1,0)))(x)
+    x = DarknetConv2D_BN_Leaky(num_filters, (3,3), strides=(2,2))(x)
+    for i in range(num_blocks):
+        y = compose(
+            DarknetConv2D_BN_Leaky(num_filters//2, (1,1)),
+            DarknetConv2D_BN_Leaky(num_filters, (3,3)))(x)
+        x = Add()([x,y])
+    return x
+
+
+def darknet_body(x):
+    '''Darknet body
having 52 Convolution2D layers'''
+    x = DarknetConv2D_BN_Leaky(32, (3,3))(x)
+    x = resblock_body(x, 64, 1)
+    x = resblock_body(x, 128, 2)
+    x = resblock_body(x, 256, 8)
+    x = resblock_body(x, 512, 8)
+    x = resblock_body(x, 1024, 4)
+    return x
+
+
+def make_last_layers(x, num_filters, out_filters):
+    '''6 Conv2D_BN_Leaky layers followed by a Conv2D_linear layer'''
+    x = compose(
+        DarknetConv2D_BN_Leaky(num_filters, (1,1)),
+        DarknetConv2D_BN_Leaky(num_filters*2, (3,3)),
+        DarknetConv2D_BN_Leaky(num_filters, (1,1)),
+        DarknetConv2D_BN_Leaky(num_filters*2, (3,3)),
+        DarknetConv2D_BN_Leaky(num_filters, (1,1)))(x)
+    y = compose(
+        DarknetConv2D_BN_Leaky(num_filters*2, (3,3)),
+        DarknetConv2D(out_filters, (1,1)))(x)
+    return x, y
+
+
+def yolo_body(inputs, num_anchors, num_classes):
+    """Create YOLO_V3 model CNN body in Keras."""
+    darknet = Model(inputs, darknet_body(inputs))
+    x, y1 = make_last_layers(darknet.output, 512, num_anchors*(num_classes+5))
+
+    x = compose(
+        DarknetConv2D_BN_Leaky(256, (1,1)),
+        UpSampling2D(2))(x)
+    x = Concatenate()([x,darknet.layers[152].output])
+    x, y2 = make_last_layers(x, 256, num_anchors*(num_classes+5))
+
+    x = compose(
+        DarknetConv2D_BN_Leaky(128, (1,1)),
+        UpSampling2D(2))(x)
+    x = Concatenate()([x,darknet.layers[92].output])
+    x, y3 = make_last_layers(x, 128, num_anchors*(num_classes+5))
+
+    return Model(inputs, [y1,y2,y3])
+
+
+def yolo_head(feats, anchors, num_classes, input_shape):
+    """Convert final layer features to bounding box parameters."""
+    num_anchors = len(anchors)
+    # Reshape to batch, height, width, num_anchors, box_params.
+    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])
+
+    grid_shape = K.shape(feats)[1:3]  # height, width
+    grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
+                    [1, grid_shape[1], 1, 1])
+    grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
+                    [grid_shape[0], 1, 1, 1])
+    grid = K.concatenate([grid_x, grid_y])
+    grid = K.cast(grid, K.dtype(feats))
+
+    feats = K.reshape(
+        feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])
+
+    box_xy = K.sigmoid(feats[..., :2])
+    box_wh = K.exp(feats[..., 2:4])
+    box_confidence = K.sigmoid(feats[..., 4:5])
+    box_class_probs = K.sigmoid(feats[..., 5:])
+
+    # Adjust predictions to each spatial grid point and anchor size.
+    box_xy = (box_xy + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
+    box_wh = box_wh * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))
+
+    return box_xy, box_wh, box_confidence, box_class_probs
+
+
+def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
+    '''Get corrected boxes'''
+    box_yx = box_xy[..., ::-1]
+    box_hw = box_wh[..., ::-1]
+    input_shape = K.cast(input_shape, K.dtype(box_yx))
+    image_shape = K.cast(image_shape, K.dtype(box_yx))
+    new_shape = K.round(image_shape * K.min(input_shape/image_shape))
+    offset = (input_shape-new_shape)/2./input_shape
+    scale = input_shape/new_shape
+    box_yx = (box_yx - offset) * scale
+    box_hw *= scale
+
+    box_mins = box_yx - (box_hw / 2.)
+    box_maxes = box_yx + (box_hw / 2.)
+    boxes = K.concatenate([
+        box_mins[..., 0:1],   # y_min
+        box_mins[..., 1:2],   # x_min
+        box_maxes[..., 0:1],  # y_max
+        box_maxes[..., 1:2]   # x_max
+    ])
+
+    # Scale boxes back to original image shape.
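+    # The offset and scale computed above undo the letterbox padding applied
+    # at input time; multiplying by the image shape then yields boxes in the
+    # original image's pixel coordinates.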
+    boxes *= K.concatenate([image_shape, image_shape])
+    return boxes
+
+
+def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape):
+    '''Process Conv layer output'''
+    box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats,
+        anchors, num_classes, input_shape)
+    boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape)
+    boxes = K.reshape(boxes, [-1, 4])
+    box_scores = box_confidence * box_class_probs
+    box_scores = K.reshape(box_scores, [-1, num_classes])
+    return boxes, box_scores
+
+
+def yolo_eval(yolo_outputs,
+              anchors,
+              num_classes,
+              image_shape,
+              max_boxes=20,
+              score_threshold=.6,
+              iou_threshold=.5):
+    """Evaluate YOLO model on given input and return filtered boxes."""
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
+    input_shape = K.shape(yolo_outputs[0])[1:3] * 32
+    boxes = []
+    box_scores = []
+    for l in range(3):
+        _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l],
+            anchors[anchor_mask[l]], num_classes, input_shape, image_shape)
+        boxes.append(_boxes)
+        box_scores.append(_box_scores)
+    boxes = K.concatenate(boxes, axis=0)
+    box_scores = K.concatenate(box_scores, axis=0)
+
+    mask = box_scores >= score_threshold
+    max_boxes_tensor = K.constant(max_boxes, dtype='int32')
+    boxes_ = []
+    scores_ = []
+    classes_ = []
+    for c in range(num_classes):
+        class_boxes = tf.boolean_mask(boxes, mask[:, c])
+        class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c])
+        nms_index = tf.image.non_max_suppression(
+            class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold)
+        class_boxes = K.gather(class_boxes, nms_index)
+        class_box_scores = K.gather(class_box_scores, nms_index)
+        classes = K.ones_like(class_box_scores, 'int32') * c
+        boxes_.append(class_boxes)
+        scores_.append(class_box_scores)
+        classes_.append(classes)
+    boxes_ = K.concatenate(boxes_, axis=0)
+    scores_ = K.concatenate(scores_, axis=0)
+    classes_ = K.concatenate(classes_, axis=0)
+
+    return boxes_, scores_, classes_
+
+
+def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes):
+    '''Preprocess true boxes to training input format
+
+    Parameters
+    ----------
+    true_boxes: array, shape=(m, T, 5)
+        Absolute x_min, y_min, x_max, y_max, class_code relative to input_shape.
+    input_shape: array-like, hw, multiples of 32
+    anchors: array, shape=(N, 2), wh
+    num_classes: integer
+
+    Returns
+    -------
+    y_true: list of array, shape like yolo_outputs, xywh are relative values
+
+    '''
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
+
+    true_boxes = np.array(true_boxes, dtype='float32')
+    input_shape = np.array(input_shape, dtype='int32')
+    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
+    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]
+    true_boxes[..., 0:2] = boxes_xy/input_shape[::-1]
+    true_boxes[..., 2:4] = boxes_wh/input_shape[::-1]
+
+    m = true_boxes.shape[0]
+    grid_shapes = [input_shape//{0:32, 1:16, 2:8}[l] for l in range(3)]
+    y_true = [np.zeros((m,grid_shapes[l][0],grid_shapes[l][1],len(anchor_mask[l]),5+num_classes),
+        dtype='float32') for l in range(3)]
+
+    # Expand dim to apply broadcasting.
+    anchors = np.expand_dims(anchors, 0)
+    anchor_maxes = anchors / 2.
+    anchor_mins = -anchor_maxes
+    valid_mask = boxes_wh[..., 0]>0
+
+    for b in range(m):
+        # Discard zero rows.
+        wh = boxes_wh[b, valid_mask[b]]
+        # Expand dim to apply broadcasting.
+        wh = np.expand_dims(wh, -2)
+        box_maxes = wh / 2.
+ box_mins = -box_maxes + + intersect_mins = np.maximum(box_mins, anchor_mins) + intersect_maxes = np.minimum(box_maxes, anchor_maxes) + intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.) + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + box_area = wh[..., 0] * wh[..., 1] + anchor_area = anchors[..., 0] * anchors[..., 1] + iou = intersect_area / (box_area + anchor_area - intersect_area) + + # Find best anchor for each true box + best_anchor = np.argmax(iou, axis=-1) + + for t, n in enumerate(best_anchor): + for l in range(3): + if n in anchor_mask[l]: + i = np.floor(true_boxes[b,t,0]*grid_shapes[l][1]).astype('int32') + j = np.floor(true_boxes[b,t,1]*grid_shapes[l][0]).astype('int32') + n = anchor_mask[l].index(n) + c = true_boxes[b,t, 4].astype('int32') + y_true[l][b, j, i, n, 0:4] = true_boxes[b,t, 0:4] + y_true[l][b, j, i, n, 4] = 1 + y_true[l][b, j, i, n, 5+c] = 1 + break + + return y_true + + +def box_iou(b1, b2): + '''Return iou tensor + + Parameters + ---------- + b1: tensor, shape=(i1,...,iN, 4), xywh + b2: tensor, shape=(j, 4), xywh + + Returns + ------- + iou: tensor, shape=(i1,...,iN, j) + + ''' + + # Expand dim to apply broadcasting. + b1 = K.expand_dims(b1, -2) + b1_xy = b1[..., :2] + b1_wh = b1[..., 2:4] + b1_wh_half = b1_wh/2. + b1_mins = b1_xy - b1_wh_half + b1_maxes = b1_xy + b1_wh_half + + # Expand dim to apply broadcasting. + b2 = K.expand_dims(b2, 0) + b2_xy = b2[..., :2] + b2_wh = b2[..., 2:4] + b2_wh_half = b2_wh/2. + b2_mins = b2_xy - b2_wh_half + b2_maxes = b2_xy + b2_wh_half + + intersect_mins = K.maximum(b1_mins, b2_mins) + intersect_maxes = K.minimum(b1_maxes, b2_maxes) + intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + b1_area = b1_wh[..., 0] * b1_wh[..., 1] + b2_area = b2_wh[..., 0] * b2_wh[..., 1] + iou = intersect_area / (b1_area + b2_area - intersect_area) + + return iou + + +def yolo_loss(args, anchors, num_classes, ignore_thresh=.5): + '''Return yolo_loss tensor + + Parameters + ---------- + yolo_outputs: list of tensor, the output of yolo_body + y_true: list of array, the output of preprocess_true_boxes + anchors: array, shape=(T, 2), wh + num_classes: integer + ignore_thresh: float, the iou threshold whether to ignore object confidence loss + + Returns + ------- + loss: tensor, shape=(1,) + + ''' + yolo_outputs = args[:3] + y_true = args[3:] + anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] + input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0])) + grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(3)] + loss = 0 + m = K.shape(yolo_outputs[0])[0] + + for l in range(3): + object_mask = y_true[l][..., 4:5] + true_class_probs = y_true[l][..., 5:] + + pred_xy, pred_wh, pred_confidence, pred_class_probs = yolo_head(yolo_outputs[l], + anchors[anchor_mask[l]], num_classes, input_shape) + pred_box = K.concatenate([pred_xy, pred_wh]) + + # Darknet box loss. + xy_delta = (y_true[l][..., :2]-pred_xy)*grid_shapes[l][::-1] + wh_delta = K.log(y_true[l][..., 2:4]) - K.log(pred_wh) + # Avoid log(0)=-inf. + wh_delta = K.switch(object_mask, wh_delta, K.zeros_like(wh_delta)) + box_delta = K.concatenate([xy_delta, wh_delta], axis=-1) + box_delta_scale = 2 - y_true[l][...,2:3]*y_true[l][...,3:4] + + # Find ignore mask, iterate over each of batch. 
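+        # Predicted boxes whose best IoU with any ground-truth box exceeds
+        # ignore_thresh are excluded from the no-object confidence loss, since
+        # they likely cover a real object assigned to another anchor or scale.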
+ ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True) + object_mask_bool = K.cast(object_mask, 'bool') + def loop_body(b, ignore_mask): + true_box = tf.boolean_mask(y_true[l][b,...,0:4], object_mask_bool[b,...,0]) + iou = box_iou(pred_box[b], true_box) + best_iou = K.max(iou, axis=-1) + ignore_mask = ignore_mask.write(b, K.cast(best_iou