Skip to content

Minor bugfixes for Persephone #2456

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/analysis/persephone/SeqC_pipeline/.gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
/seqc_proc/{DEPO_demo,DEPO_proc,REPO_host,REPO_tool}/
/seqc_input/
/seqc_output/
.DS_Store
#/DB/REPO_tool/kraken/taxonomy/nucl_{gb,wgs}.accession2taxid.gz
#/DB/REPO_host
Expand Down
25 changes: 19 additions & 6 deletions src/analysis/persephone/SeqC_pipeline/BASH_seqc_mama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
# flesh out func_demo to build complete demo run within /DB/DEPO_demo
# auto compile file list from input dir in absence of provided list
# expand splash to include system params: cpu, mem, du of key directories
# implement pv for progress bar... tar -I pigz -xvf stuff.tar.gz | pv
#refs
# prodigal:https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-119#citeas
# https://www.cyberciti.biz/tips/bash-shell-parameter-substitution-2.html
Expand All @@ -53,6 +54,8 @@ var_rep_len=${#var_rep_ary[@]}
#size check -db
v_vol_real=$(du -bs $v_dir_db | cut -f1 )
v_vol_none=60000
#system max cpu
v_sys_mem=$(nproc)
v_scrp_check=0
#var_rep_ary[var_rep_len]='x/y/z/cat'*'.kitty'
#var_rep_out=${var_rep_ary[@]}
Expand Down Expand Up @@ -1518,6 +1521,7 @@ if (( ${vopt_log} ));then
fi
printf 'Input file type: %s\n%s\n' "${vopt_file_type}" "${v_logblock1}" >> ${v_logfile}
# Check zip status - CRIT - extend to .tar files in dir
# pot. just exclude .tar
vt_L=$( printf '%s\n' "${v_file_in_good[@]}" | grep -E "\.gz|gzip$" | wc -l )
vt_R='0'
if [[ ${vt_L} -gt ${vt_R} ]];then
Expand Down Expand Up @@ -1724,12 +1728,21 @@ for istep in "${vopt_step[@]}";do
vin_head=''
vin_tail=''
fi
# proc check - ensure that too many processes are not called
if [[ -z ${venv_proc_max} ]];then
venv_proc_max=$(( ${venv_cpu_max} / 2 ))
fi
if [[ $(printf %.0f $(echo "${venv_cpu_max} * ${venv_proc_max}" | bc -l)) -gt "${v_sys_mem}" ]];then
vt="${venv_proc_max}"
while [[ $(printf %.0f $(echo "${venv_cpu_max} * ${vt}" | bc -l)) -gt "${v_sys_mem}" ]];do
((vt--))
#printf 'in%s\n' "${vt}"
done
#printf 'out%s\n' "${vt}"
venv_proc_max="${vt}"
fi
if [[ ${vopt_com_log} -eq 0 ]];then
#insert as awk print block - ex: vopt_com='{ printf "--output-format tsv --output-basename %s",va_nID }'
#improve with adjustment to :--processes <1>
if [[ -z ${venv_proc_max} ]];then
venv_proc_max=$(( ${venv_cpu_max} / 2 ))
fi
vin_com='{ printf " --remove-intermediate-output --reference-db %s --threads %s --processes %s --max-memory %sg --trimmomatic /opt/conda/envs/env_s1_kneaddata/share/trimmomatic --reorder","'${vin_host_rm}'",'${venv_cpu_max}','${venv_proc_max}','${venv_mem_max}' }'
fi
#WORKING EX
Expand Down Expand Up @@ -1986,7 +1999,7 @@ for istep in "${vopt_step[@]}";do
v_k2mpa_out=${v_kjoin_out/out/mpa_out_RA}
cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_frac} > "${v_k2mpa_out}"
#drop col type for match with meta data
sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
#counts
v_match='num'
v_bk_cols=( $( head ${v_kjoin_out/out/mpa_out} -n 1 ) )
Expand All @@ -2006,7 +2019,7 @@ for istep in "${vopt_step[@]}";do
v_k2mpa_out=${v_kjoin_out/out/mpa_out_RC}
cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_num} > "${v_k2mpa_out}"
#drop col type for match with meta data
sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
#default mars input is now "${v_k2mpa_out}" = /DB/DEPO_proc/step2_kraken/KB_S_mpa_out_RC.txt
vout_sX=${vout_s2}
# step keep-clean
Expand Down
9 changes: 5 additions & 4 deletions src/analysis/persephone/createBatchMWBM.m
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
parser.addParameter('checkFeasibility', true, @islogical);
parser.addParameter('wbmDirectory', '', @ischar);
parser.addParameter('solver', 'gurobi', @ischar);
parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_04c', @ischar);
parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_04c', @ischar);
parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_03d', @ischar);
parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_03d', @ischar);

% Parse required and optional inputs
parser.parse(mgpipePath, mWBMPath, metadataPath, varargin{:});
Expand Down Expand Up @@ -201,8 +201,9 @@
% Make sure that each iWBM is paired with their corresponding
% microbiota model
iWBMnames = what(wbmDirectory).mat;
iWBMnames = extractBetween(iWBMnames,'iWBM_','.mat');

iWBMnames = replace(iWBMnames, {'iWBM_','miWBM_','iWBM','miWBM','.mat'},'');
%iWBMnames = extractAfter(iWBMnames,'iWBM','.mat');
%iWBMnames = extractAfter(iWBMnames,'_','.mat');
% Find the intersection between the iWBM and microbiome samples and
% reorder the name arrays.
[~,ia,ib] = intersect(iWBMnames,microbiomeSamples,'stable');
Expand Down
35 changes: 26 additions & 9 deletions src/analysis/persephone/initPersephone.m
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,36 @@
paths (1, :) {mustBeNonempty}
end

%%% Test for Matlab toolbox dependency issues %%%

% Check if the parallel toolbox is installed (Required)
if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
%%% Test for Matlab toolbox dependency issues %%%
%fix for error from missing toolbox - stats and ML - wb 20250305
sysAddons = matlab.addons.installedAddons();
if any(strcmp(sysAddons.Name,'Parallel Computing Toolbox'))
% Check if the parallel toolbox is installed (Required)
if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
else
if ~license('test','Distrib_Computing_Toolbox')
error('It seems the Paralell Computing Toolbox is installed but no valid license exists. Please consider updating/obtaining a license, it is required to generate microbiome models, HM models and to generate flux results.')
end
end
end

% Check if the statistics toolbox is installed (Not critical, but recommended)
if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
statToolboxInstalled = false;
warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
if any(strcmp(sysAddons.Name,'Statistics and Machine Learning Toolbox'))
if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
statToolboxInstalled = false;
warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
else
if license('test', 'Statistics_Toolbox')
statToolboxInstalled = true;
else
warning('It seems the Statistics and Machine Learning Toolbox installed but no valid license exists. Please consider updating/obtaining a license as it is required for analysis')
statToolboxInstalled = false;
end
end
else
statToolboxInstalled = true;
statToolboxInstalled = false;

end


Expand Down
22 changes: 21 additions & 1 deletion src/analysis/persephone/persWBM.m
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,30 @@
if ~ischar(metadataPath)
error('metadata data does not satisfy the input requirments (iscell, ischar, istable)')
else
Data = readMetadataForPersephone(metadataPath);
% Read the metadata table. If the variable names have more than 64
% characters, the variable names in the table will be truncated. We account
% for this later, so this warning is ignored here.
warning('off')
opts = detectImportOptions(metadataPath, 'VariableNamingRule', 'preserve');
opts = setvartype(opts, opts.VariableNames(1), 'string');
Data = readtable(metadataPath, opts);
warning('on')

% Check if metadata table is not empty
validateattributes(Data, {'table'}, {'nonempty'}, mfilename, 'metadataTable')

% The variable names in the metadata table will be truncated if the names
% are longer than 64 characters. Read the true variable names and store
% them in the VariableDescriptions property.
% Next, the first 2 lines of the metadata are loaded
unprocessedMetadata = strrep(metadataPath, '_processed', '');
metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
Data.Properties.VariableUnits = string(metadataCell(2,:));

end



% Check Data has units stored in table properties
if ~strcmp('ID', Data.Properties.VariableNames{1})
error('Please ensure column 1 of your metadata is ID and is labelled accordingly')
Expand Down
25 changes: 5 additions & 20 deletions src/analysis/persephone/readMetadataForPersephone.m
Original file line number Diff line number Diff line change
Expand Up @@ -45,28 +45,13 @@
% Check if metadata table is not empty
validateattributes(metadataTable, {'table'}, {'nonempty'}, mfilename, 'metadataTable')

% The variable names in the metadata table will be truncated if the names
% are longer than 64 characters. Read the true variable names and store
% them in the VariableDescriptions property.
% Next, the first 2 lines of the metadata are loaded
unprocessedMetadata = strrep(metadataPath, '_processed', '');
metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
metadataTable.Properties.VariableDescriptions = string(metadataCell(1,:));

% If the metadata table has a second row with units, we need to remove that
% row from the loaded metadataTable and store it in the VariableUnits
% property.
if matches(metadataCell{2,1},{'unit','units'},'ignoreCase',true)

% % If the metadata table has a second row with units, we need to remove that
% % row from the loaded metadataTable and store it in the VariableUnits
% % property.
if any(matches(metadataTable{1,1},{'unit','units'},'ignoreCase',true))
% Remove the second header line from the table
% metadataTable(1,:) = [];

% Get the variable units
units = string(metadataCell(2,:));
units(ismissing(units)) = "NA";

% Add unit information to the table
metadataTable.Properties.VariableUnits = units;
metadataTable(1,:) = [];
end

% Set output
Expand Down
3 changes: 2 additions & 1 deletion src/analysis/persephone/runMars.m
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ function runMars(pythonPath, marsRepoPath, cutoffMARS, outputExtensionMARS, flag
py.importlib.import_module('MARS');

%Enter folder that contains the "main.py" script
cd(strcat(marsRepoPath, '\MARS'));

cd(fullfile(marsRepoPath, 'MARS'));

% Set all optional inputs in Python readable format
marsOptArg = pyargs('cutoff', cutoffMARS, ...
Expand Down
89 changes: 44 additions & 45 deletions src/analysis/persephone/runPersephone.m
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@
tic;
% Load all variables defined in the configPersephone.m
run(configPath);

resultPath
% Do not run MARS if seqC was run.
if paths.seqC.flagSeqC == true
paths.Mars.paths.Mars.flagMars = false;
Expand All @@ -284,13 +284,15 @@

% Validate the set workspace variables and give descriptive errors on which
% variables were not set correctly.
resultPath
validatePersephoneInputs(paths,resultPath);

% Create log file on the pipeline setup
diary([resultPath filesep 'logFile_initialisation.txt'])
resultPath
diary(fullfile(resultPath, 'logFile_initialisation.txt'))

% Initialise PERSEPHONE
[~, ~, updatedMetadataPath] = initPersephone(resultPath, paths);
[~, statToolboxInstalled, updatedMetadataPath] = initPersephone(resultPath, paths);
paths.General.metadataPath = updatedMetadataPath;

% Validate the path to the given diet.
Expand Down Expand Up @@ -412,46 +414,43 @@
disp(' > Calculation of relative abundances with MARS is skipped')
end
diary('off'); % Save MARS logfile
%% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
% Remove samples in the metadata that are not in the microbiome data
if ~isempty(paths.Mars.relAbunFilePath)

% Load microbiome data
relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');

% Check if the microbiome table is empty
validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')

% Check if a column named Taxon exists
if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
end

% Read in the metadata
metadata = readMetadataForPersephone(paths.General.metadataPath);

% Find the intersection of samples between the readsTable and the
% metadata table
[~,ia,ib] = intersect(metadata.ID, ...
relAbunFile.Properties.VariableNames(2:end)','stable');

% Perform checks
if isempty(ia)
error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
else
numRemovedSamples = size(metadata,1)-numel(ia);
if numRemovedSamples>0
disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
end
relAbunFile = relAbunFile(:,[1; 1+ib]);
metadata = metadata(ia,:);
end

% Remove all samples in the metadata that are not in the readsTable
disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
writetable(relAbunFile, paths.Mars.relAbunFilePath);
writetable(metadata,paths.General.metadataPath);
end
% %% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
% % Remove samples in the metadata that are not in the microbiome data
% if ~isempty(paths.Mars.relAbunFilePath)
%
% % Load microbiome data
% relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');
%
% % Check if the microbiome table is empty
% validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')
%
% % Check if a column named Taxon exists
% if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
% error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
% end
%
% % Read in the metadata
% metadata = readMetadataForPersephone(paths.General.metadataPath);
%
% % Find the intersection of samples between the readsTable and the
% % metadata table
% [~,ia] = intersect(metadata.ID, ...
% relAbunFile.Properties.VariableNames(2:end)','stable');
% % Perform checks
% if isempty(ia)
% error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
% else
% numRemovedSamples = size(metadata,1)-numel(ia);
% if numRemovedSamples>0
% disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
% end
% metadata = metadata(ia,:);
% end
%
% % Remove all samples in the metadata that are not in the readsTable
% disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
% writetable(metadata,paths.General.metadataPath);
% end

%% Part 2:
%%%%%%%%%%%%%%%%%%%%%%% Create microbiome models %%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down Expand Up @@ -611,7 +610,7 @@
%% Part 6:
%%%%%%%%%%%%%%%%%%%%%% Analysis of flux results %%%%%%%%%%%%%%%%%%%%%%%%%%%
tic;
if paths.stats.flagStatistics
if paths.stats.flagStatistics && statToolboxInstalled
diary([resultPath filesep 'logFile_statistics.txt']) % Open diary for log file

for i = 1:size(paths.stats.response,2)
Expand Down Expand Up @@ -644,7 +643,7 @@
end
totalT= 0;
fields = fieldnames(progress);
for i = 1:numel(fields)
for i = 1:numel(fields)-1
data = progress.(fields{i});
totalT = totalT + data(2);
end
Expand Down
4 changes: 2 additions & 2 deletions src/analysis/persephone/validatePersephoneInputs.m
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@

% Validate inputs for statistics
if paths.stats.flagStatistics
validateattributes(paths.stats.outputPathStatistics,{'char','string'},{'nonempty'},'outputPathStatistics')
validateattributes(paths.stats.outputPathStatistics,{'char','string','cell'},{'nonempty'},'outputPathStatistics')
validateattributes(paths.stats.response,{'char','string','cell'},{'nonempty'},'response')
validateattributes(paths.stats.confounders,{'char','string','cell'},{'nonempty'},'confounders')
validateattributes(paths.stats.confounders,{'char','string','cell'},{''},'confounders')
validateattributes(paths.stats.moderationAnalysis,{'logical'},{'scalar'},'moderationAnalysis')
end

Expand Down