initialized #42

Open: wants to merge 5 commits into main
10 changes: 10 additions & 0 deletions requirements.txt
@@ -0,0 +1,10 @@
numpy
cython
librosa>=0.8.0
numba==0.54.1
scipy
mido>=1.2.6
pytest
madmom
torch
matplotlib
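
As a quick sanity check (not part of the PR), the dependency list above should import cleanly in the target environment; a minimal sketch, assuming each PyPI package exposes a same-named module:

import importlib

for pkg in ['numpy', 'cython', 'librosa', 'numba', 'scipy',
            'mido', 'pytest', 'madmom', 'torch', 'matplotlib']:
    importlib.import_module(pkg)  # raises ImportError on a missing dependency
print('all requirements import OK')
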
48 changes: 36 additions & 12 deletions src/BeatNet/BeatNet.py
@@ -9,11 +9,11 @@
import torch
import numpy as np
from madmom.features import DBNDownBeatTrackingProcessor
-from BeatNet.particle_filtering_cascade import particle_filter_cascade
-from BeatNet.log_spect import LOG_SPECT
+from particle_filtering_cascade import particle_filter_cascade
+from log_spect import LOG_SPECT
import librosa
import sys
-from BeatNet.model import BDA
+from model import BDA
import pyaudio
import matplotlib.pyplot as plt
import time
@@ -89,8 +89,8 @@ def __init__(self, model, mode='online', inference_model='PF', plot=[], thread=F
rate=self.sample_rate,
input=True,
frames_per_buffer=self.log_spec_hop_length,)

-def process(self, audio_path=None):
+# gives output from self.pred
+def process(self, audio_path=None):  # takes audio path
if self.mode == "stream":
if self.inference_model != "PF":
raise RuntimeError('The inference model should be set to "PF" for the streaming mode!')
@@ -138,7 +138,7 @@ def process(self, audio_path=None):
output = self.estimator(preds) # Using DBN offline inference to infer beat/downbeats
return output


elif self.mode == "offline":
if self.inference_model != "DBN":
raise RuntimeError('The inference model should be set to "DBN" for the offline mode!')
@@ -150,7 +150,7 @@ def process(self, audio_path=None):
else:
raise RuntimeError('An audio object or file directory is required for the offline usage!')


def activation_extractor_stream(self):
# TODO:
''' Streaming window
@@ -164,15 +164,25 @@ def activation_extractor_stream(self):
self.pred = np.zeros([1,2])
else:
feats = self.proc.process_audio(self.stream_window).T[-1]
+print(feats.shape, 'is the shape of feats extracted from the stream window (last frame taken)')
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0).unsqueeze(0).to(self.device)
+print(feats.shape, 'is the shape of feats that the DL model takes as input')
pred = self.model(feats)[0]
+print(pred.shape, 'is the shape of pred after the model (index 0 taken)')
pred = self.model.final_pred(pred)
pred = pred.cpu().detach().numpy()
+print(pred.shape, 'is the shape of pred after detaching and converting to numpy')
self.pred = np.transpose(pred[:2, :])


-def activation_extractor_realtime(self, audio_path):
+def activation_extractor_realtime(self, audio_path: str) -> None:
+'''
+Extracts activations from the audio data in real-time.
+
+Parameters:
+    audio_path (str): Path to the audio file.
+'''
with torch.no_grad():
if self.counter==0: #loading the audio
if isinstance(audio_path, str):
@@ -196,20 +206,34 @@ def activation_extractor_realtime(self, audio_path):
self.completed = 1


-def activation_extractor_online(self, audio_path):
+def activation_extractor_online(self, audio_path: str) -> np.ndarray:
+'''
+Extracts activations from the audio data online.
+
+Parameters:
+    audio_path (str): Path to the audio file.
+
+Returns:
+    np.ndarray: A numpy array containing the extracted activations.
+'''
with torch.no_grad():
if isinstance(audio_path, str):
-audio, _ = librosa.load(audio_path, sr=self.sample_rate) # reading the data
-elif len(np.shape(audio_path))>1:
-audio = np.mean(audio_path ,axis=1)
+audio, _ = librosa.load(audio_path, sr=self.sample_rate) # reading the data
+elif len(np.shape(audio_path)) > 1:
+audio = np.mean(audio_path, axis=1)
else:
audio = audio_path
feats = self.proc.process_audio(audio).T
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0).to(self.device)
+print(feats.shape, 'is the shape of feats fed to the model')
preds = self.model(feats)[0] # extracting the activations by passing the feature through the NN
preds = self.model.final_pred(preds)
preds = preds.cpu().detach().numpy()
+print(preds.shape, 'is the shape of preds after detaching')

preds = np.transpose(preds[:2, :])
+print(preds.shape, 'is the shape of preds after transposing (final)')
return preds

Binary file added src/BeatNet/__pycache__/BeatNet.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/common.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/log_spect.cpython-38.pyc
Binary file not shown.
Binary file added src/BeatNet/__pycache__/model.cpython-38.pyc
Binary file not shown.
Binary file not shown.
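
Note on the import changes in BeatNet.py above: switching from BeatNet.model to top-level model (and likewise for log_spect and particle_filtering_cascade) means those modules only resolve when src/BeatNet itself is on sys.path. A minimal sketch of running the package from a repository checkout (the relative path is an assumption):

import sys
sys.path.append('src/BeatNet')  # hypothetical path to the package directory
from BeatNet import BeatNet     # model, log_spect, etc. now resolve as top-level modules
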
106 changes: 106 additions & 0 deletions src/BeatNet/dataloader.py
@@ -0,0 +1,106 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import librosa
from common import *
from log_spect import LOG_SPECT

class BeatNetDataset(Dataset):
def __init__(self, audio_dir, target_dir):
self.audio_dir = audio_dir
self.target_dir = target_dir

self.audio_path = sorted([os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')])
self.target_path = sorted([os.path.join(target_dir, f) for f in os.listdir(target_dir) if f.endswith('.beats')])  # sorted so audio/target pairs align

if len(self.audio_path) != len(self.target_path):
raise ValueError('Number of audio files and target files do not match')

self.data_names = self._get_data_list()
self.sample_rate = 22050
self.log_spec_sample_rate = self.sample_rate
self.log_spec_hop_length = int(20 * 0.001 * self.log_spec_sample_rate)
self.log_spec_win_length = int(64 * 0.001 * self.log_spec_sample_rate)

self.proc = LOG_SPECT(sample_rate=self.log_spec_sample_rate, win_length=self.log_spec_win_length,
hop_size=self.log_spec_hop_length, n_bands=[24], mode = 'online')

def __len__(self):
return len(self.audio_path)

def __getitem__(self, idx):
data = self._get_data(self.audio_path[idx])
target = self._get_targets(self.target_path[idx])
return data, target

def _get_data(self, audio_path):
audio, _ = librosa.load(audio_path, sr=self.sample_rate)
if audio.ndim > 1:
audio = np.mean(audio, axis=1)
feats = self.proc.process_audio(audio).T
feats = torch.from_numpy(feats)
feats = feats.unsqueeze(0) # add a leading batch dimension -> [1, frames, features]
return feats

def _get_targets(self, target_path):
target_list = []
with open(target_path, 'r') as f:
for line in f:
parsed = self._text_label_to_float(line)
target_list.append(parsed)

# Frame count taken from the first audio file (assumes all clips share a length)
sample_feats = self._get_data(self.audio_path[0])
beat_vector = np.zeros((sample_feats.shape[1], 3))

beat_times = np.array([x[0] for x in target_list]) * self.sample_rate

for time in beat_times:
spec_frame = min(int(time / self.log_spec_hop_length), beat_vector.shape[0] - 1)
for n in range(-2, 3):
if 0 <= spec_frame + n < beat_vector.shape[0]:
beat_vector[spec_frame + n] = 1.0 if n == 0 else 0.5

return torch.tensor(beat_vector)

def _get_data_list(self):
names = []
for entry in os.scandir(self.target_dir):
names.append(os.path.splitext(entry.name)[0])
return names

def _text_label_to_float(self, text):
allowed = '1234567890. \t'
filtered = ''.join([c for c in text if c in allowed])
if '\t' in filtered:
t = filtered.rstrip('\n').split('\t')
else:
t = filtered.rstrip('\n').split(' ')
return float(t[0]), float(t[1])
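
# Example of the assumed .beats annotation layout parsed above: a line such as
# "1.234\t2" (or "1.234 2") yields (1.234, 2.0) -- the beat time in seconds
# followed by its position within the bar.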

if __name__ == '__main__':
# Test dataloader
audio_dir = '/home/nikhil/moji/BeatNet/test/test_data/wav'
target_dir = '/home/nikhil/moji/BeatNet/test/test_data/beats'

try:
dataset = BeatNetDataset(audio_dir, target_dir)

# Fetch the first sample
sample_data, sample_target = dataset[2]

# Print data and target shapes
print('Input data shape:', sample_data.shape)
print('Target shape:', sample_target.shape)
print('Target:', sample_target)
print('Data', sample_data)

# Print the length of the dataset
print('Dataset length:', len(dataset))

except Exception as e:
print(f"An error occurred: {e}")
95 changes: 80 additions & 15 deletions src/BeatNet/log_spect.py
@@ -1,43 +1,108 @@
-# Author: Mojtaba Heydari <[email protected]>
+# # Author: Mojtaba Heydari <[email protected]>


# from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
# from madmom.audio.stft import ShortTimeFourierTransformProcessor
# from madmom.audio.spectrogram import (
# FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor,
# SpectrogramDifferenceProcessor)
# from madmom.processors import ParallelProcessor, SequentialProcessor
# from BeatNet.common import *
# import numpy as np



# # feature extractor that extracts magnitude spectrogoram and its differences

# class LOG_SPECT(FeatureModule):
# def __init__(self, num_channels=1, sample_rate=22050, win_length=2048, hop_size=512, n_bands=[12], mode='online'):
# sig = SignalProcessor(num_channels=num_channels, win_length=win_length, sample_rate=sample_rate)
# self.sample_rate = sample_rate
# self.hop_length = hop_size
# self.num_channels = num_channels
# multi = ParallelProcessor([])
# frame_sizes = [win_length]
# num_bands = n_bands
# for frame_size, num_bands in zip(frame_sizes, num_bands):
# if mode == 'online' or mode == 'offline':
# frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
# else: # for real-time and streaming modes
# frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
# stft = ShortTimeFourierTransformProcessor() # caching FFT window
# filt = FilteredSpectrogramProcessor(
# num_bands=num_bands, fmin=30, fmax=17000, norm_filters=True)
# spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
# diff = SpectrogramDifferenceProcessor(
# diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
# # process each frame size with spec and diff sequentially
# multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
# # stack the features and processes everything sequentially
# self.pipe = SequentialProcessor((sig, multi, np.hstack))

# def process_audio(self, audio):
# feats = self.pipe(audio)
# return feats.T

# if __name__ == '__main__':
# # test the feature extraction module and get features for a sample audio file
# audio_path = '/home/nikhil/moji/BeatNet/src/BeatNet/test_data/808kick120bpm.mp3'
# feats = LOG_SPECT().process_audio(audio_path)
# print(feats)

# Author: Mojtaba Heydari <[email protected]>

from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import (
FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor,
SpectrogramDifferenceProcessor)
from madmom.processors import ParallelProcessor, SequentialProcessor
-from BeatNet.common import *
+from common import FeatureModule
import numpy as np
import sys
sys.path.append('/home/nikhil/moji/BeatNet/src/BeatNet')


-# feature extractor that extracts magnitude spectrogoram and its differences
+# Feature extractor that extracts magnitude spectrogram and its differences

class LOG_SPECT(FeatureModule):
def __init__(self, num_channels=1, sample_rate=22050, win_length=2048, hop_size=512, n_bands=[12], mode='online'):
-sig = SignalProcessor(num_channels=num_channels, win_length=win_length, sample_rate=sample_rate)
+sig = SignalProcessor(num_channels=num_channels, sample_rate=sample_rate)
self.sample_rate = sample_rate
self.hop_length = hop_size
self.num_channels = num_channels
multi = ParallelProcessor([])
frame_sizes = [win_length]
num_bands = n_bands
-for frame_size, num_bands in zip(frame_sizes, num_bands):
-    if mode == 'online' or mode == 'offline':
-        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
-    else: # for real-time and streaming modes
-        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)
+for frame_size, num_band in zip(frame_sizes, num_bands):
+    if mode in ['online', 'offline']:
+        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size)
+    else:  # for real-time and streaming modes
+        frames = FramedSignalProcessor(frame_size=frame_size, hop_size=hop_size, num_frames=4)

stft = ShortTimeFourierTransformProcessor() # caching FFT window
-filt = FilteredSpectrogramProcessor(
-    num_bands=num_bands, fmin=30, fmax=17000, norm_filters=True)
+filt = FilteredSpectrogramProcessor(num_bands=num_band, fmin=30, fmax=17000, norm_filters=True)

spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
-diff = SpectrogramDifferenceProcessor(
-    diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
-# process each frame size with spec and diff sequentially
+diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
+# Process each frame size with spec and diff sequentially
multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
-# stack the features and processes everything sequentially
+# Stack the features and process everything sequentially
self.pipe = SequentialProcessor((sig, multi, np.hstack))

def process_audio(self, audio):
feats = self.pipe(audio)
return feats.T

if __name__ == '__main__':

# Test the feature extraction module and get features for a sample audio file
audio_path = '/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-01.wav'
op = LOG_SPECT().process_audio(audio_path)
print(op.shape)
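
For reference, a self-contained sketch (the file path is a placeholder; the 20 ms hop and 64 ms window mirror the dataloader's settings) of driving the refactored extractor:

import librosa
from log_spect import LOG_SPECT

sr = 22050
proc = LOG_SPECT(sample_rate=sr, win_length=int(0.064 * sr),
                 hop_size=int(0.020 * sr), n_bands=[24], mode='online')
audio, _ = librosa.load('example.wav', sr=sr)  # hypothetical file
feats = proc.process_audio(audio).T            # (num_frames, num_features)
print(feats.shape)
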
11 changes: 10 additions & 1 deletion src/BeatNet/model.py
@@ -38,16 +38,25 @@ def __init__(self, dim_in, num_cells, num_layers, device):

def forward(self, data):
x = data
+print(np.shape(x), 'input data shape in forward')
x = torch.reshape(x, (-1, self.dim_in))
+print(np.shape(x), 'after reshape')
x = x.unsqueeze(0).transpose(0, 1)
+print(np.shape(x), 'after unsqueeze and transpose')
x = F.max_pool1d(F.relu(self.conv1(x)), 2)
+print(np.shape(x), 'after max_pool1d')
x = x.view(-1, self.num_flat_features(x))
+print(np.shape(x), 'after view')
x = self.linear0(x)
+print(np.shape(x), 'after linear0')
x = torch.reshape(x, (np.shape(data)[0], np.shape(data)[1], self.conv_out))
+print(np.shape(x), 'after reshape')
x, (self.hidden, self.cell) = self.lstm(x, (self.hidden, self.cell))
# x = self.lstm(x)[0]
+print(np.shape(x), 'after lstm')
out = self.linear(x)
+print(np.shape(out), 'after linear')
out = out.transpose(1, 2)
+print(np.shape(out), 'final output shape')
return out

def final_pred(self, input):
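The shape traces added to forward() above print on every call; a sketch of one alternative (an assumption, not part of the PR) that routes them through the standard logging module so they can be silenced outside debugging:

import logging

log = logging.getLogger('BeatNet.model')
# inside forward(), each trace would become, e.g.:
# log.debug('%s after reshape', tuple(x.shape))
logging.basicConfig(level=logging.DEBUG)  # enable the traces only while debugging
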
8 changes: 8 additions & 0 deletions src/BeatNet/test.py
@@ -0,0 +1,8 @@
from BeatNet import BeatNet

estimator = BeatNet(1, mode='online', inference_model='PF', plot=['activations'], thread=False)


output = estimator.process('/home/nikhil/moji/BeatNet/test/test_data/wav/Albums-AnaBelen_Veneo-02.wav')

print(output)
Empty file added src/BeatNet/train.py
Empty file.