initialized #42

Open: wants to merge 5 commits into main
10 changes: 10 additions & 0 deletions requirements.txt
@@ -0,0 +1,10 @@
numpy
cython
librosa>=0.8.0
numba==0.54.1
scipy
mido>=1.2.6
pytest
madmom
torch
matplotlib
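
As a quick sanity check (not part of the PR), the dependency list above should import cleanly in the target environment; a minimal sketch, assuming each PyPI package exposes a same-named module:

import importlib

for pkg in ['numpy', 'cython', 'librosa', 'numba', 'scipy',
            'mido', 'pytest', 'madmom', 'torch', 'matplotlib']:
    importlib.import_module(pkg)  # raises ImportError on a missing dependency
print('all requirements import OK')
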
48 changes: 36 additions & 12 deletions src/BeatNet/BeatNet.py
@@ -9,11 +9,11 @@
import torch
import numpy as np
from madmom.features import DBNDownBeatTrackingProcessor
-from BeatNet.particle_filtering_cascade import particle_filter_cascade
-from BeatNet.log_spect import LOG_SPECT
+from particle_filtering_cascade import particle_filter_cascade
+from log_spect import LOG_SPECT
import librosa
import sys
-from BeatNet.model import BDA
+from model import BDA
import pyaudio
import matplotlib.pyplot as plt
import time
@@ -89,8 +89,8 @@ def __init__(self, model, mode='online', inference_model='PF', plot=[], thread=F
rate=self.sample_rate,
input=True,
frames_per_buffer=self.log_spec_hop_length,)

-def process(self, audio_path=None):
+# gives output from self.pred
+def process(self, audio_path=None):  # takes audio path
if self.mode == "stream":
if self.inference_model != "PF":
raise RuntimeError('The inference model should be set to "PF" for the streaming mode!')
@@ -138,7 +138,7 @@ def process(self, audio_path=None):
output = self.estimator(preds) # Using DBN offline inference to infer beat/downbeats
return output


elif self.mode == "offline":
if self.inference_model != "DBN":
raise RuntimeError('The inference model should be set to "DBN" for the offline mode!')
@@ -150,7 +150,7 @@ def process(self, audio_path=None):
else:
raise RuntimeError('An audio object or file directory is required for the offline usage!')


def activation_extractor_stream(self):
# TODO:
''' Streaming window
@@ -164,15 +164,25 @@ def activation_extractor_stream(self):
self.pred = np.zeros([1,2])
else:
feats = self.proc.process_audio(self.stream_window).T[-1]
+print(feats.shape, 'is the shape of feats extracted from the stream window (last frame taken)')
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0).unsqueeze(0).to(self.device)
+print(feats.shape, 'is the shape of feats that the DL model takes as input')
pred = self.model(feats)[0]
+print(pred.shape, 'is the shape of pred after the model (index 0 taken)')
pred = self.model.final_pred(pred)
pred = pred.cpu().detach().numpy()
+print(pred.shape, 'is the shape of pred after detaching and converting to numpy')
self.pred = np.transpose(pred[:2, :])


-def activation_extractor_realtime(self, audio_path):
+def activation_extractor_realtime(self, audio_path: str) -> None:
+'''
+Extracts activations from the audio data in real-time.
+
+Parameters:
+    audio_path (str): Path to the audio file.
+'''
with torch.no_grad():
if self.counter==0: #loading the audio
if isinstance(audio_path, str):
@@ -196,20 +206,34 @@ def activation_extractor_realtime(self, audio_path):
self.completed = 1


-def activation_extractor_online(self, audio_path):
+def activation_extractor_online(self, audio_path: str) -> np.ndarray:
+'''
+Extracts activations from the audio data online.
+
+Parameters:
+    audio_path (str): Path to the audio file.
+
+Returns:
+    np.ndarray: A numpy array containing the extracted activations.
+'''
with torch.no_grad():
if isinstance(audio_path, str):
-audio, _ = librosa.load(audio_path, sr=self.sample_rate) # reading the data
-elif len(np.shape(audio_path))>1:
-audio = np.mean(audio_path ,axis=1)
+audio, _ = librosa.load(audio_path, sr=self.sample_rate) # reading the data
+elif len(np.shape(audio_path)) > 1:
+audio = np.mean(audio_path, axis=1)
else:
audio = audio_path
feats = self.proc.process_audio(audio).T
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0).to(self.device)
+print(feats.shape, 'is the shape of feats fed to the model')
preds = self.model(feats)[0] # extracting the activations by passing the feature through the NN
preds = self.model.final_pred(preds)
preds = preds.cpu().detach().numpy()
+print(preds.shape, 'is the shape of preds after detaching')

preds = np.transpose(preds[:2, :])
+print(preds.shape, 'is the shape of preds after transposing (final)')
return preds

Binary file added src/BeatNet/__pycache__/BeatNet.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/common.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/log_spect.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/model.cpython-38.pyc
Binary file not shown.
Binary file not shown.
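
Note on the import changes in BeatNet.py above: switching from BeatNet.model to top-level model (and likewise for log_spect and particle_filtering_cascade) means those modules only resolve when src/BeatNet itself is on sys.path. A minimal sketch of running the package from a repository checkout (the relative path is an assumption):

import sys
sys.path.append('src/BeatNet')  # hypothetical path to the package directory
from BeatNet import BeatNet     # model, log_spect, etc. now resolve as top-level modules
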
106 changes: 106 additions & 0 deletions src/BeatNet/dataloader.py
@@ -0,0 +1,106 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import librosa
from common import *
from log_spect import LOG_SPECT

class BeatNetDataset(Dataset):
def __init__(self, audio_dir, target_dir):
self.audio_dir = audio_dir
self.target_dir = target_dir

self.audio_path = sorted([os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')])
self.target_path = sorted([os.path.join(target_dir, f) for f in os.listdir(target_dir) if f.endswith('.beats')])  # sorted so audio/target pairs align

if len(self.audio_path) != len(self.target_path):
raise ValueError('Number of audio files and target files do not match')

self.data_names = self._get_data_list()
self.sample_rate = 22050
self.log_spec_sample_rate = self.sample_rate
self.log_spec_hop_length = int(20 * 0.001 * self.log_spec_sample_rate)
self.log_spec_win_length = int(64 * 0.001 * self.log_spec_sample_rate)

self.proc = LOG_SPECT(sample_rate=self.log_spec_sample_rate, win_length=self.log_spec_win_length,
hop_size=self.log_spec_hop_length, n_bands=[24], mode = 'online')

def __len__(self):
return len(self.audio_path)

def __getitem__(self, idx):
data = self._get_data(self.audio_path[idx])
target = self._get_targets(self.target_path[idx])
return data, target

def _get_data(self, audio_path):
audio, _ = librosa.load(audio_path, sr=self.sample_rate)
if audio.ndim > 1:
audio = np.mean(audio, axis=1)
feats = self.proc.process_audio(audio).T
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0) # add a leading batch dimension -> [1, frames, features]
return feats

def _get_targets(self, target_path):
target_list = []
with open(target_path, 'r') as f:
for line in f:
parsed = self._text_label_to_float(line)
target_list.append(parsed)

# Frame count taken from the first audio file (assumes all clips share a length)
sample_feats = self._get_data(self.audio_path[0])
beat_vector = np.zeros((sample_feats.shape[1], 3))

beat_times = np.array([x[0] for x in target_list]) * self.sample_rate

for time in beat_times:
spec_frame = min(int(time / self.log_spec_hop_length), beat_vector.shape[0] - 1)
for n in range(-2, 3):
if 0 <= spec_frame + n < beat_vector.shape[0]:
beat_vector[spec_frame + n] = 1.0 if n == 0 else 0.5

return torch.tensor(beat_vector)

def _get_data_list(self):
names = []
for entry in os.scandir(self.target_dir):
names.append(os.path.splitext(entry.name)[0])
return names

def _text_label_to_float(self, text):
allowed = '1234567890. \t'
filtered = ''.join([c for c in text if c in allowed])
if '\t' in filtered:
t = filtered.rstrip('\n').split('\t')
else:
t = filtered.rstrip('\n').split(' ')
return float(t[0]), float(t[1])
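
# Example of the assumed .beats annotation layout parsed above: a line such as
# "1.234\t2" (or "1.234 2") yields (1.234, 2.0) -- the beat time in seconds
# followed by its position within the bar.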

if __name__ == '__main__':
# Test dataloader
audio_dir = '/home/nikhil/moji/BeatNet/test/test_data/wav'
target_dir = '/home/nikhil/moji/BeatNet/test/test_data/beats'

try:
dataset = BeatNetDataset(audio_dir, target_dir)

# Fetch the first sample
sample_data, sample_target = dataset[2]

# Print data and target shapes
print('Input data shape:', sample_data.shape)
print('Target shape:', sample_target.shape)
print('Target:', sample_target)
print('Data', sample_data)

# Print the length of the dataset
print('Dataset length:', len(dataset))

except Exception as e:
print(f"An error occurred: {e}")
95 changes: 80 additions & 15 deletions src/BeatNet/log_spect.py
@@ -1,43 +1,108 @@
-# Author: Mojtaba Heydari <[email protected]>
+# # Author: Mojtaba Heydari <[email protected]>


# from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
# from madmom.audio.stft import ShortTimeFourierTransformProcessor
# from madmom.audio.spectrogram import (
# FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor,
# SpectrogramDifferenceProcessor)
# from madmom.processors import ParallelProcessor, SequentialProcessor
# from BeatNet.common import *
# import numpy as np



# # feature extractor that extracts magnitude spectrogoram and its differences

# class LOG_SPECT(FeatureModule):
# def __init__(self, num_channels=1, sample_rate=22050, win_length=2048, hop_size=512, n_bands=[12], mode='online'):
# sig = SignalProcessor(num_channels=num_channels, win_length=win_length, sample_rate=sample_rate)
# self.sample_rate = sample_rate
# self.hop_length = hop_size
# self.num_channels = num_channels
# multi = ParallelProcessor([])
# frame_sizes = [win_length]
# num_bands = n_bands
# for frame_size, num_bands in zip(frame_sizes, num_bands):
# if mode == 'online' or mode == 'offline':
# frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
# else: # for real-time and streaming modes
# frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
# stft = ShortTimeFourierTransformProcessor() # caching FFT window
# filt = FilteredSpectrogramProcessor(
# num_bands=num_bands, fmin=30, fmax=17000, norm_filters=True)
# spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
# diff = SpectrogramDifferenceProcessor(
# diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
# # process each frame size with spec and diff sequentially
# multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
# # stack the features and processes everything sequentially
# self.pipe = SequentialProcessor((sig, multi, np.hstack))

# def process_audio(self, audio):
# feats = self.pipe(audio)
# return feats.T

# if __name__ == '__main__':
# # test the feature extraction module and get features for a sample audio file
# audio_path = '/home/nikhil/moji/BeatNet/src/BeatNet/test_data/808kick120bpm.mp3'
# feats = LOG_SPECT().process_audio(audio_path)
# print(feats)

# Author: Mojtaba Heydari <[email protected]>

from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import (
FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor,
SpectrogramDifferenceProcessor)
from madmom.processors import ParallelProcessor, SequentialProcessor
-from BeatNet.common import *
+from common import FeatureModule
import numpy as np
import sys
sys.path.append('/home/nikhil/moji/BeatNet/src/BeatNet')


-# feature extractor that extracts magnitude spectrogoram and its differences
+# Feature extractor that extracts magnitude spectrogram and its differences

class LOG_SPECT(FeatureModule):
def __init__(self, num_channels=1, sample_rate=22050, win_length=2048, hop_size=512, n_bands=[12], mode='online'):
-sig = SignalProcessor(num_channels=num_channels, win_length=win_length, sample_rate=sample_rate)
+sig = SignalProcessor(num_channels=num_channels, sample_rate=sample_rate)
self.sample_rate = sample_rate
self.hop_length = hop_size
self.num_channels = num_channels
multi = ParallelProcessor([])
frame_sizes = [win_length]
num_bands = n_bands
-for frame_size, num_bands in zip(frame_sizes, num_bands):
-    if mode == 'online' or mode == 'offline':
-        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
-    else: # for real-time and streaming modes
-        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
+for frame_size, num_band in zip(frame_sizes, num_bands):
+    if mode in ['online', 'offline']:
+        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
+    else:  # for real-time and streaming modes
+        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)

stft = ShortTimeFourierTransformProcessor() # caching FFT window
-filt = FilteredSpectrogramProcessor(
-    num_bands=num_bands, fmin=30, fmax=17000, norm_filters=True)
+filt = FilteredSpectrogramProcessor(num_bands=num_band, fmin=30, fmax=17000, norm_filters=True)

spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
-diff = SpectrogramDifferenceProcessor(
-    diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
-# process each frame size with spec and diff sequentially
+diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
+# Process each frame size with spec and diff sequentially
multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
-# stack the features and processes everything sequentially
+# Stack the features and process everything sequentially
self.pipe = SequentialProcessor((sig, multi, np.hstack))

def process_audio(self, audio):
feats = self.pipe(audio)
return feats.T

if __name__ == '__main__':

# Test the feature extraction module and get features for a sample audio file
audio_path = '/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav'
op = LOG_SPECT().process_audio(audio_path)
print(op.shape)
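
For reference, a self-contained sketch (the file path is a placeholder; the 20 ms hop and 64 ms window mirror the dataloader's settings) of driving the refactored extractor:

import librosa
from log_spect import LOG_SPECT

sr = 22050
proc = LOG_SPECT(sample_rate=sr, win_length=int(0.064 * sr),
                 hop_size=int(0.020 * sr), n_bands=[24], mode='online')
audio, _ = librosa.load('example.wav', sr=sr)  # hypothetical file
feats = proc.process_audio(audio).T            # (num_frames, num_features)
print(feats.shape)
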
11 changes: 10 additions & 1 deletion src/BeatNet/model.py
@@ -38,16 +38,25 @@ def __init__(self, dim_in, num_cells, num_layers, device):

def forward(self, data):
x = data
+print(np.shape(x), 'input data shape in forward')
x = torch.reshape(x, (-1, self.dim_in))
+print(np.shape(x), 'after reshape')
x = x.unsqueeze(0).transpose(0, 1)
+print(np.shape(x), 'after unsqueeze and transpose')
x = F.max_pool1d(F.relu(self.conv1(x)), 2)
+print(np.shape(x), 'after max_pool1d')
x = x.view(-1, self.num_flat_features(x))
+print(np.shape(x), 'after view')
x = self.linear0(x)
+print(np.shape(x), 'after linear0')
x = torch.reshape(x, (np.shape(data)[0], np.shape(data)[1], self.conv_out))
+print(np.shape(x), 'after reshape')
x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
# x = self.lstm(x)[0]
+print(np.shape(x), 'after lstm')
out = self.linear(x)
+print(np.shape(out), 'after linear')
out = out.transpose(1, 2)
+print(np.shape(out), 'final output shape')
return out

def final_pred(self, input):
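The shape traces added to forward() above print on every call; a sketch of one alternative (an assumption, not part of the PR) that routes them through the standard logging module so they can be silenced outside debugging:

import logging

log = logging.getLogger('BeatNet.model')
# inside forward(), each trace would become, e.g.:
# log.debug('%s after reshape', tuple(x.shape))
logging.basicConfig(level=logging.DEBUG)  # enable the traces only while debugging
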
8 changes: 8 additions & 0 deletions src/BeatNet/test.py
@@ -0,0 +1,8 @@
from BeatNet import BeatNet

estimator = BeatNet(1, mode='online', inference_model='PF', plot=['activations'], thread=False)


output = estimator.process('/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav')

print(output)
Empty file added src/BeatNet/train.py
Empty file.