Skip to content

Commit b967fd3

Browse files
authored
Merge pull request #2456 from bramnap/develop
Minor bugfixes for Persephone
2 parents a5bf6a3 + 90de3c8 commit b967fd3

9 files changed

+124
-90
lines changed

src/analysis/persephone/SeqC_pipeline/.gitignore

-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
/seqc_proc/{DEPO_demo,DEPO_proc,REPO_host,REPO_tool}/
2-
/seqc_input/
3-
/seqc_output/
42
.DS_Store
53
#/DB/REPO_tool/kraken/taxonomy/nucl_{gb,wgs}.accession2taxid.gz
64
#/DB/REPO_host

src/analysis/persephone/SeqC_pipeline/BASH_seqc_mama.sh

+19-6
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
# flesh out func_demo to build complete demo run within /DB/DEPO_demo
3737
# auto compile file list from input dir in absence of provided list
3838
# expand splash to include system params: cpu, mem, du of key directories
39+
# implement pv for progress bar... tar -I pigz -xvf stuff.tar.gz | pv
3940
#refs
4041
# prodigal:https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-119#citeas
4142
# https://www.cyberciti.biz/tips/bash-shell-parameter-substitution-2.html
@@ -53,6 +54,8 @@ var_rep_len=${#var_rep_ary[@]}
5354
#size check -db
5455
v_vol_real=$(du -bs $v_dir_db | cut -f1 )
5556
v_vol_none=60000
57+
#system max cpu
58+
v_sys_mem=$(nproc)
5659
v_scrp_check=0
5760
#var_rep_ary[var_rep_len]='x/y/z/cat'*'.kitty'
5861
#var_rep_out=${var_rep_ary[@]}
@@ -1518,6 +1521,7 @@ if (( ${vopt_log} ));then
15181521
fi
15191522
printf 'Input file type: %s\n%s\n' "${vopt_file_type}" "${v_logblock1}" >> ${v_logfile}
15201523
# Check zip status - CRIT - extend to .tar files in dir
1524+
# pot. just exclude .tar
15211525
vt_L=$( printf '%s\n' "${v_file_in_good[@]}" | grep -E "\.gz|gzip$" | wc -l )
15221526
vt_R='0'
15231527
if [[ ${vt_L} -gt ${vt_R} ]];then
@@ -1724,12 +1728,21 @@ for istep in "${vopt_step[@]}";do
17241728
vin_head=''
17251729
vin_tail=''
17261730
fi
1731+
# proc check - ensure that too many processes are not called
1732+
if [[ -z ${venv_proc_max} ]];then
1733+
venv_proc_max=$(( ${venv_cpu_max} / 2 ))
1734+
fi
1735+
if [[ $(printf %.0f $(echo "${venv_cpu_max} * ${venv_proc_max}" | bc -l)) -gt "${v_sys_mem}" ]];then
1736+
vt="${venv_proc_max}"
1737+
while [[ $(printf %.0f $(echo "${venv_cpu_max} * ${vt}" | bc -l)) -gt "${v_sys_mem}" ]];do
1738+
((vt--))
1739+
#printf 'in%s\n' "${vt}"
1740+
done
1741+
#printf 'out%s\n' "${vt}"
1742+
venv_proc_max="${vt}"
1743+
fi
17271744
if [[ ${vopt_com_log} -eq 0 ]];then
17281745
#insert as awk print block - ex: vopt_com='{ printf "--output-format tsv --output-basename %s",va_nID }'
1729-
#improve with adjustment to :--processes <1>
1730-
if [[ -z ${venv_proc_max} ]];then
1731-
venv_proc_max=$(( ${venv_cpu_max} / 2 ))
1732-
fi
17331746
vin_com='{ printf " --remove-intermediate-output --reference-db %s --threads %s --processes %s --max-memory %sg --trimmomatic /opt/conda/envs/env_s1_kneaddata/share/trimmomatic --reorder","'${vin_host_rm}'",'${venv_cpu_max}','${venv_proc_max}','${venv_mem_max}' }'
17341747
fi
17351748
#WORKING EX
@@ -1986,7 +1999,7 @@ for istep in "${vopt_step[@]}";do
19861999
v_k2mpa_out=${v_kjoin_out/out/mpa_out_RA}
19872000
cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_frac} > "${v_k2mpa_out}"
19882001
#drop col type for match with meta data
1989-
sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
2002+
sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
19902003
#counts
19912004
v_match='num'
19922005
v_bk_cols=( $( head ${v_kjoin_out/out/mpa_out} -n 1 ) )
@@ -2006,7 +2019,7 @@ for istep in "${vopt_step[@]}";do
20062019
v_k2mpa_out=${v_kjoin_out/out/mpa_out_RC}
20072020
cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_num} > "${v_k2mpa_out}"
20082021
#drop col type for match with meta data
2009-
sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
2022+
sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
20102023
#default mars input is now "${v_k2mpa_out}" = /DB/DEPO_proc/step2_kraken/KB_S_mpa_out_RC.txt
20112024
vout_sX=${vout_s2}
20122025
# step keep-clean

src/analysis/persephone/createBatchMWBM.m

+5-4
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@
5050
parser.addParameter('checkFeasibility', true, @islogical);
5151
parser.addParameter('wbmDirectory', '', @ischar);
5252
parser.addParameter('solver', 'gurobi', @ischar);
53-
parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_04c', @ischar);
54-
parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_04c', @ischar);
53+
parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_03d', @ischar);
54+
parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_03d', @ischar);
5555

5656
% Parse required and optional inputs
5757
parser.parse(mgpipePath, mWBMPath, metadataPath, varargin{:});
@@ -201,8 +201,9 @@
201201
% Make sure that each iWBM is paired with their corresponding
202202
% microbiota model
203203
iWBMnames = what(wbmDirectory).mat;
204-
iWBMnames = extractBetween(iWBMnames,'iWBM_','.mat');
205-
204+
iWBMnames = replace(iWBMnames, {'iWBM_','miWBM_','iWBM','miWBM','.mat'},'');
205+
%iWBMnames = extractAfter(iWBMnames,'iWBM','.mat');
206+
%iWBMnames = extractAfter(iWBMnames,'_','.mat');
206207
% Find the intersection between the iWBM and microbiome samples and
207208
% reorder the name arrays.
208209
[~,ia,ib] = intersect(iWBMnames,microbiomeSamples,'stable');

src/analysis/persephone/initPersephone.m

+26-9
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,36 @@
4949
paths (1, :) {mustBeNonempty}
5050
end
5151

52-
%%% Test for Matlab toolbox dependency issues %%%
5352

54-
% Check if the parallel toolbox is installed (Required)
55-
if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
56-
error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
53+
%%% Test for Matlab toolbox dependency issues %%%
54+
%fix for error from missing toolbox - stats and ML - wb 20250305
55+
sysAddons = matlab.addons.installedAddons();
56+
if any(strcmp(sysAddons.Name,'Parallel Computing Toolbox'))
57+
% Check if the parallel toolbox is installed (Required)
58+
if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
59+
error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
60+
else
61+
if ~license('test','Distrib_Computing_Toolbox')
62+
error('It seems the Paralell Computing Toolbox is installed but no valid license exists. Please consider updating/obtaining a license, it is required to generate microbiome models, HM models and to generate flux results.')
63+
end
64+
end
5765
end
58-
5966
% Check if the statistics toolbox is installed (Not critical, but recommended)
60-
if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
61-
statToolboxInstalled = false;
62-
warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
67+
if any(strcmp(sysAddons.Name,'Statistics and Machine Learning Toolbox'))
68+
if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
69+
statToolboxInstalled = false;
70+
warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
71+
else
72+
if license('test', 'Statistics_Toolbox')
73+
statToolboxInstalled = true;
74+
else
75+
warning('It seems the Statistics and Machine Learning Toolbox installed but no valid license exists. Please consider updating/obtaining a license as it is required for analysis')
76+
statToolboxInstalled = false;
77+
end
78+
end
6379
else
64-
statToolboxInstalled = true;
80+
statToolboxInstalled = false;
81+
6582
end
6683

6784

src/analysis/persephone/persWBM.m

+21-1
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,30 @@
135135
if ~ischar(metadataPath)
136136
error('metadata data does not satisfy the input requirments (iscell, ischar, istable)')
137137
else
138-
Data = readMetadataForPersephone(metadataPath);
138+
% Read the metadata table. If the variable names have more than 64
139+
% characters, the variable names in the table will be truncated. We account
140+
% for this later, so this warning is ignored here.
141+
warning('off')
142+
opts = detectImportOptions(metadataPath, 'VariableNamingRule', 'preserve');
143+
opts = setvartype(opts, opts.VariableNames(1), 'string');
144+
Data = readtable(metadataPath, opts);
145+
warning('on')
146+
147+
% Check if metadata table is not empty
148+
validateattributes(Data, {'table'}, {'nonempty'}, mfilename, 'metadataTable')
149+
150+
% The variable names in the metadata table will be truncated if the names
151+
% are longer than 64 characters. Read the true variable names and store
152+
% them in the VariableDescriptions property.
153+
% Next, the first 2 lines of the metadata are loaded
154+
unprocessedMetadata = strrep(metadataPath, '_processed', '');
155+
metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
156+
Data.Properties.VariableUnits = string(metadataCell(2,:));
157+
139158
end
140159

141160

161+
142162
% Check Data has units stored in table properties
143163
if ~strcmp('ID', Data.Properties.VariableNames{1})
144164
error('Please ensure column 1 of your metadata is ID and is labelled accordingly')

src/analysis/persephone/readMetadataForPersephone.m

+5-20
Original file line numberDiff line numberDiff line change
@@ -45,28 +45,13 @@
4545
% Check if metadata table is not empty
4646
validateattributes(metadataTable, {'table'}, {'nonempty'}, mfilename, 'metadataTable')
4747

48-
% The variable names in the metadata table will be truncated if the names
49-
% are longer than 64 characters. Read the true variable names and store
50-
% them in the VariableDescriptions property.
51-
% Next, the first 2 lines of the metadata are loaded
52-
unprocessedMetadata = strrep(metadataPath, '_processed', '');
53-
metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
54-
metadataTable.Properties.VariableDescriptions = string(metadataCell(1,:));
55-
56-
% If the metadata table has a second row with units, we need to remove that
57-
% row from the loaded metadataTable and store it in the VariableUnits
58-
% property.
59-
if matches(metadataCell{2,1},{'unit','units'},'ignoreCase',true)
6048

49+
% % If the metadata table has a second row with units, we need to remove that
50+
% % row from the loaded metadataTable and store it in the VariableUnits
51+
% % property.
52+
if any(matches(metadataTable{1,1},{'unit','units'},'ignoreCase',true))
6153
% Remove the second header line from the table
62-
% metadataTable(1,:) = [];
63-
64-
% Get the variable units
65-
units = string(metadataCell(2,:));
66-
units(ismissing(units)) = "NA";
67-
68-
% Add unit information to the table
69-
metadataTable.Properties.VariableUnits = units;
54+
metadataTable(1,:) = [];
7055
end
7156

7257
% Set output

src/analysis/persephone/runMars.m

+2-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ function runMars(pythonPath, marsRepoPath, cutoffMARS, outputExtensionMARS, flag
5757
py.importlib.import_module('MARS');
5858

5959
%Enter folder that contains the "main.py" script
60-
cd(strcat(marsRepoPath, '\MARS'));
60+
61+
cd(fullfile(marsRepoPath, 'MARS'));
6162

6263
% Set all optional inputs in Python readable format
6364
marsOptArg = pyargs('cutoff', cutoffMARS, ...

src/analysis/persephone/runPersephone.m

+44-45
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@
273273
tic;
274274
% Load all variables defined in the configPersephone.m
275275
run(configPath);
276-
276+
resultPath
277277
% Do not run MARS if seqC was run.
278278
if paths.seqC.flagSeqC == true
279279
paths.Mars.paths.Mars.flagMars = false;
@@ -284,13 +284,15 @@
284284

285285
% Validate the set workspace variables and give descriptive errors on which
286286
% variables were not set correctly.
287+
resultPath
287288
validatePersephoneInputs(paths,resultPath);
288289

289290
% Create log file on the pipeline setup
290-
diary([resultPath filesep 'logFile_initialisation.txt'])
291+
resultPath
292+
diary(fullfile(resultPath, 'logFile_initialisation.txt'))
291293

292294
% Initialise PERSEPHONE
293-
[~, ~, updatedMetadataPath] = initPersephone(resultPath, paths);
295+
[~, statToolboxInstalled, updatedMetadataPath] = initPersephone(resultPath, paths);
294296
paths.General.metadataPath = updatedMetadataPath;
295297

296298
% Validate the path to the given diet.
@@ -412,46 +414,43 @@
412414
disp(' > Calculation of relative abundances with MARS is skipped')
413415
end
414416
diary('off'); % Save MARS logfile
415-
%% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
416-
% Remove samples in the metadata that are not in the microbiome data
417-
if ~isempty(paths.Mars.relAbunFilePath)
418-
419-
% Load microbiome data
420-
relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');
421-
422-
% Check if the microbiome table is empty
423-
validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')
424-
425-
% Check if a column named Taxon exists
426-
if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
427-
error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
428-
end
429-
430-
% Read in the metadata
431-
metadata = readMetadataForPersephone(paths.General.metadataPath);
432-
433-
% Find the intersection of samples between the readsTable and the
434-
% metadata table
435-
[~,ia,ib] = intersect(metadata.ID, ...
436-
relAbunFile.Properties.VariableNames(2:end)','stable');
437-
438-
% Perform checks
439-
if isempty(ia)
440-
error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
441-
else
442-
numRemovedSamples = size(metadata,1)-numel(ia);
443-
if numRemovedSamples>0
444-
disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
445-
end
446-
relAbunFile = relAbunFile(:,[1; 1+ib]);
447-
metadata = metadata(ia,:);
448-
end
449-
450-
% Remove all samples in the metadata that are not in the readsTable
451-
disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
452-
writetable(relAbunFile, paths.Mars.relAbunFilePath);
453-
writetable(metadata,paths.General.metadataPath);
454-
end
417+
% %% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
418+
% % Remove samples in the metadata that are not in the microbiome data
419+
% if ~isempty(paths.Mars.relAbunFilePath)
420+
%
421+
% % Load microbiome data
422+
% relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');
423+
%
424+
% % Check if the microbiome table is empty
425+
% validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')
426+
%
427+
% % Check if a column named Taxon exists
428+
% if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
429+
% error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
430+
% end
431+
%
432+
% % Read in the metadata
433+
% metadata = readMetadataForPersephone(paths.General.metadataPath);
434+
%
435+
% % Find the intersection of samples between the readsTable and the
436+
% % metadata table
437+
% [~,ia] = intersect(metadata.ID, ...
438+
% relAbunFile.Properties.VariableNames(2:end)','stable');
439+
% % Perform checks
440+
% if isempty(ia)
441+
% error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
442+
% else
443+
% numRemovedSamples = size(metadata,1)-numel(ia);
444+
% if numRemovedSamples>0
445+
% disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
446+
% end
447+
% metadata = metadata(ia,:);
448+
% end
449+
%
450+
% % Remove all samples in the metadata that are not in the readsTable
451+
% disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
452+
% writetable(metadata,paths.General.metadataPath);
453+
% end
455454

456455
%% Part 2:
457456
%%%%%%%%%%%%%%%%%%%%%%% Create microbiome models %%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -611,7 +610,7 @@
611610
%% Part 6:
612611
%%%%%%%%%%%%%%%%%%%%%% Analysis of flux results %%%%%%%%%%%%%%%%%%%%%%%%%%%
613612
tic;
614-
if paths.stats.flagStatistics
613+
if paths.stats.flagStatistics && statToolboxInstalled
615614
diary([resultPath filesep 'logFile_statistics.txt']) % Open diary for log file
616615

617616
for i = 1:size(paths.stats.response,2)
@@ -644,7 +643,7 @@
644643
end
645644
totalT= 0;
646645
fields = fieldnames(progress);
647-
for i = 1:numel(fields)
646+
for i = 1:numel(fields)-1
648647
data = progress.(fields{i});
649648
totalT = totalT + data(2);
650649
end

src/analysis/persephone/validatePersephoneInputs.m

+2-2
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@
109109

110110
% Validate inputs for statistics
111111
if paths.stats.flagStatistics
112-
validateattributes(paths.stats.outputPathStatistics,{'char','string'},{'nonempty'},'outputPathStatistics')
112+
validateattributes(paths.stats.outputPathStatistics,{'char','string','cell'},{'nonempty'},'outputPathStatistics')
113113
validateattributes(paths.stats.response,{'char','string','cell'},{'nonempty'},'response')
114-
validateattributes(paths.stats.confounders,{'char','string','cell'},{'nonempty'},'confounders')
114+
validateattributes(paths.stats.confounders,{'char','string','cell'},{''},'confounders')
115115
validateattributes(paths.stats.moderationAnalysis,{'logical'},{'scalar'},'moderationAnalysis')
116116
end
117117

0 commit comments

Comments
 (0)