opencobra · farid-zare · Mar 13, 2025 · Mar 12, 2025
diff --git a/src/analysis/persephone/SeqC_pipeline/.gitignore b/src/analysis/persephone/SeqC_pipeline/.gitignore
@@ -1,6 +1,4 @@
 /seqc_proc/{DEPO_demo,DEPO_proc,REPO_host,REPO_tool}/
-/seqc_input/
-/seqc_output/
 .DS_Store
 #/DB/REPO_tool/kraken/taxonomy/nucl_{gb,wgs}.accession2taxid.gz
 #/DB/REPO_host

diff --git a/src/analysis/persephone/SeqC_pipeline/BASH_seqc_mama.sh b/src/analysis/persephone/SeqC_pipeline/BASH_seqc_mama.sh
@@ -36,6 +36,7 @@
 #  flesh out func_demo to build complete demo run within /DB/DEPO_demo
 #  auto compile file list from input dir in absence of provided list
 #  expand splash to include system params: cpu, mem, du of key directories
+#  implement pv for progress bar... tar -I pigz -xvf stuff.tar.gz | pv
 #refs
 # prodigal:https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-119#citeas
 # https://www.cyberciti.biz/tips/bash-shell-parameter-substitution-2.html
@@ -53,6 +54,8 @@ var_rep_len=${#var_rep_ary[@]}
 #size check -db
 v_vol_real=$(du -bs $v_dir_db | cut -f1 )
 v_vol_none=60000
+#system max cpu
+v_sys_mem=$(nproc)
 v_scrp_check=0
 #var_rep_ary[var_rep_len]='x/y/z/cat'*'.kitty'
 #var_rep_out=${var_rep_ary[@]}
@@ -1518,6 +1521,7 @@ if (( ${vopt_log} ));then
 		fi
 		printf 'Input file type: %s\n%s\n' "${vopt_file_type}" "${v_logblock1}" >> ${v_logfile}
 		# Check zip status - CRIT - extend to .tar files in dir
+		#	pot. just exclude .tar
 		vt_L=$( printf '%s\n' "${v_file_in_good[@]}" | grep -E "\.gz|gzip$" | wc -l )
 		vt_R='0'
 		if [[ ${vt_L} -gt ${vt_R} ]];then
@@ -1724,12 +1728,21 @@ for istep in "${vopt_step[@]}";do
 			vin_head=''
 			vin_tail=''
 		fi
+		# proc check - ensure that too many processes are not called
+		if [[ -z ${venv_proc_max} ]];then
+			venv_proc_max=$(( ${venv_cpu_max} / 2 ))
+		fi
+		if [[ $(printf %.0f $(echo "${venv_cpu_max} * ${venv_proc_max}" | bc -l)) -gt "${v_sys_mem}" ]];then
+			vt="${venv_proc_max}"
+			while [[ $(printf %.0f $(echo "${venv_cpu_max} * ${vt}" | bc -l)) -gt "${v_sys_mem}" ]];do
+				((vt--))
+				#printf 'in%s\n' "${vt}"
+			done
+			#printf 'out%s\n' "${vt}"
+			venv_proc_max="${vt}"
+		fi
 		if [[ ${vopt_com_log} -eq 0 ]];then
 			#insert as awk print block - ex: vopt_com='{ printf "--output-format tsv --output-basename %s",va_nID }'
-			#improve with adjustment to :--processes <1>
-			if [[ -z ${venv_proc_max} ]];then
-				venv_proc_max=$(( ${venv_cpu_max} / 2 ))
-			fi
 			vin_com='{ printf " --remove-intermediate-output --reference-db %s --threads %s --processes %s --max-memory %sg --trimmomatic /opt/conda/envs/env_s1_kneaddata/share/trimmomatic --reorder","'${vin_host_rm}'",'${venv_cpu_max}','${venv_proc_max}','${venv_mem_max}' }'
 		fi
 		#WORKING EX
@@ -1986,7 +1999,7 @@ for istep in "${vopt_step[@]}";do
 					v_k2mpa_out=${v_kjoin_out/out/mpa_out_RA}
 					cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_frac} > "${v_k2mpa_out}"
 					#drop col type for match with meta data
-					sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
+					sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
 					#counts
 					v_match='num'
 					v_bk_cols=( $( head ${v_kjoin_out/out/mpa_out} -n 1 ) )
@@ -2006,7 +2019,7 @@ for istep in "${vopt_step[@]}";do
 					v_k2mpa_out=${v_kjoin_out/out/mpa_out_RC}
 					cat ${v_kjoin_out/out/mpa_out} | cut --fields 2${v_bk_col_num} > "${v_k2mpa_out}"
 					#drop col type for match with meta data
-					sed -i "1 s/_$v_match//" "${v_k2mpa_out}"
+					sed -i "1,1s/_$v_match//g" "${v_k2mpa_out}"
 					#default mars input is now "${v_k2mpa_out}" = /DB/DEPO_proc/step2_kraken/KB_S_mpa_out_RC.txt
 					vout_sX=${vout_s2}
 					# step keep-clean

diff --git a/src/analysis/persephone/createBatchMWBM.m b/src/analysis/persephone/createBatchMWBM.m
@@ -50,8 +50,8 @@
 parser.addParameter('checkFeasibility', true, @islogical);
 parser.addParameter('wbmDirectory', '', @ischar);
 parser.addParameter('solver', 'gurobi', @ischar);
-parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_04c', @ischar);
-parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_04c', @ischar);
+parser.addParameter('maleUnpersonalisedWBMpath', 'Harvey_1_03d', @ischar);
+parser.addParameter('femaleUnpersonalisedWBMpath', 'Harvetta_1_03d', @ischar);
 
 % Parse required and optional inputs
 parser.parse(mgpipePath, mWBMPath, metadataPath, varargin{:});
@@ -201,8 +201,9 @@
         % Make sure that each iWBM is paired with their corresponding
         % microbiota model
         iWBMnames = what(wbmDirectory).mat;
-        iWBMnames = extractBetween(iWBMnames,'iWBM_','.mat');
-
+        iWBMnames = replace(iWBMnames, {'iWBM_','miWBM_','iWBM','miWBM','.mat'},'');
+        %iWBMnames = extractAfter(iWBMnames,'iWBM','.mat');
+        %iWBMnames = extractAfter(iWBMnames,'_','.mat');
         % Find the intersection between the iWBM and microbiome samples and
         % reorder the name arrays.
         [~,ia,ib] = intersect(iWBMnames,microbiomeSamples,'stable');

diff --git a/src/analysis/persephone/initPersephone.m b/src/analysis/persephone/initPersephone.m
@@ -49,19 +49,36 @@
     paths (1, :) {mustBeNonempty}
 end
 
-%%% Test for Matlab toolbox dependency issues %%%
 
-% Check if the parallel toolbox is installed (Required)
-if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
-    error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
+%%% Test for Matlab toolbox dependency issues %%%
+%fix for error from missing toolbox - stats and ML - wb 20250305
+sysAddons = matlab.addons.installedAddons();
+if any(strcmp(sysAddons.Name,'Parallel Computing Toolbox'))
+    % Check if the parallel toolbox is installed (Required)
+    if ~matlab.addons.isAddonEnabled('Parallel Computing Toolbox')
+        error('It seems the Paralell Computing Toolbox is not installed. Please consider installing it via the add-on option in MATLAB, it is required to generate microbiome models, HM models and to generate flux results.')
+    else
+        if ~license('test','Distrib_Computing_Toolbox')
+          error('It seems the Paralell Computing Toolbox is installed but no valid license exists. Please consider updating/obtaining a license, it is required to generate microbiome models, HM models and to generate flux results.')
+        end   
+    end
 end
-
 % Check if the statistics toolbox is installed (Not critical, but recommended)
-if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
-    statToolboxInstalled = false;
-    warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
+if any(strcmp(sysAddons.Name,'Statistics and Machine Learning Toolbox'))
+    if ~matlab.addons.isAddonEnabled('Statistics and Machine Learning Toolbox')
+        statToolboxInstalled = false;
+        warning('It seems the Statistics and Machine Learning Toolbox is not installed. Please consider installing it via the add-on option in MATLAB as it is required for analysis')
+    else
+        if license('test', 'Statistics_Toolbox')
+        statToolboxInstalled = true;
+        else 
+         warning('It seems the Statistics and Machine Learning Toolbox installed but no valid license exists. Please consider updating/obtaining a license as it is required for analysis')
+        statToolboxInstalled = false;
+        end    
+    end
 else
-    statToolboxInstalled = true;
+    statToolboxInstalled = false;
+
 end
 
 

diff --git a/src/analysis/persephone/persWBM.m b/src/analysis/persephone/persWBM.m
@@ -135,10 +135,30 @@
 if ~ischar(metadataPath)
     error('metadata data does not satisfy the input requirments (iscell, ischar, istable)')
 else
-    Data = readMetadataForPersephone(metadataPath);
+    % Read the metadata table. If the variable names have more than 64
+    % characters, the variable names in the table will be truncated. We account
+    % for this later, so this warning is ignored here.
+    warning('off')
+    opts = detectImportOptions(metadataPath, 'VariableNamingRule', 'preserve');
+    opts = setvartype(opts, opts.VariableNames(1), 'string');
+    Data = readtable(metadataPath, opts);
+    warning('on')
+
+    % Check if metadata table is not empty
+    validateattributes(Data, {'table'}, {'nonempty'}, mfilename, 'metadataTable')
+
+    % The variable names in the metadata table will be truncated if the names
+    % are longer than 64 characters. Read the true variable names and store
+    % them in the VariableDescriptions property.
+    % Next, the first 2 lines of the metadata are loaded
+    unprocessedMetadata = strrep(metadataPath, '_processed', '');
+    metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
+    Data.Properties.VariableUnits = string(metadataCell(2,:));
+
 end
 
 
+
 % Check Data has units stored in table properties
 if ~strcmp('ID', Data.Properties.VariableNames{1})
     error('Please ensure column 1 of your metadata is ID and is labelled accordingly')

diff --git a/src/analysis/persephone/readMetadataForPersephone.m b/src/analysis/persephone/readMetadataForPersephone.m
@@ -45,28 +45,13 @@
 % Check if metadata table is not empty
 validateattributes(metadataTable, {'table'}, {'nonempty'}, mfilename, 'metadataTable')
 
-% The variable names in the metadata table will be truncated if the names
-% are longer than 64 characters. Read the true variable names and store
-% them in the VariableDescriptions property.
-% Next, the first 2 lines of the metadata are loaded 
-unprocessedMetadata = strrep(metadataPath, '_processed', '');
-metadataCell = readcell(unprocessedMetadata,"TextType","string",'Range', '1:2');
-metadataTable.Properties.VariableDescriptions = string(metadataCell(1,:));
-
-% If the metadata table has a second row with units, we need to remove that
-% row from the loaded metadataTable and store it in the VariableUnits
-% property.
-if matches(metadataCell{2,1},{'unit','units'},'ignoreCase',true)
 
+% % If the metadata table has a second row with units, we need to remove that
+% % row from the loaded metadataTable and store it in the VariableUnits
+% % property.
+if any(matches(metadataTable{1,1},{'unit','units'},'ignoreCase',true))
     % Remove the second header line from the table
-    % metadataTable(1,:) = [];
-
-    % Get the variable units
-    units = string(metadataCell(2,:));
-    units(ismissing(units)) = "NA";
-
-    % Add unit information to the table
-    metadataTable.Properties.VariableUnits = units;
+    metadataTable(1,:) = [];
 end
 
 % Set output

diff --git a/src/analysis/persephone/runMars.m b/src/analysis/persephone/runMars.m
@@ -57,7 +57,8 @@ function runMars(pythonPath, marsRepoPath, cutoffMARS, outputExtensionMARS, flag
 py.importlib.import_module('MARS');
 
 %Enter folder that contains the "main.py" script
-cd(strcat(marsRepoPath, '\MARS'));
+
+cd(fullfile(marsRepoPath, 'MARS'));
 
 % Set all optional inputs in Python readable format
 marsOptArg = pyargs('cutoff', cutoffMARS, ...

diff --git a/src/analysis/persephone/runPersephone.m b/src/analysis/persephone/runPersephone.m
@@ -273,7 +273,7 @@
 tic;
 % Load all variables defined in the configPersephone.m
 run(configPath);
-
+resultPath
 % Do not run MARS if seqC was run.
 if paths.seqC.flagSeqC == true
     paths.Mars.paths.Mars.flagMars = false;
@@ -284,13 +284,15 @@
 
 % Validate the set workspace variables and give descriptive errors on which
 % variables were not set correctly.
+resultPath
 validatePersephoneInputs(paths,resultPath);
 
 % Create log file on the pipeline setup
-diary([resultPath filesep 'logFile_initialisation.txt'])
+resultPath
+diary(fullfile(resultPath, 'logFile_initialisation.txt'))
 
 % Initialise PERSEPHONE
-[~, ~, updatedMetadataPath] = initPersephone(resultPath, paths);
+[~, statToolboxInstalled, updatedMetadataPath] = initPersephone(resultPath, paths);
 paths.General.metadataPath = updatedMetadataPath;
 
 % Validate the path to the given diet.
@@ -412,46 +414,43 @@
     disp(' > Calculation of relative abundances with MARS is skipped')
 end
 diary('off'); % Save MARS logfile
-%% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
-% Remove samples in the metadata that are not in the microbiome data
-if ~isempty(paths.Mars.relAbunFilePath)
-
-    % Load microbiome data
-    relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');
-
-    % Check if the microbiome table is empty
-    validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')
-
-    % Check if a column named Taxon exists
-    if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
-        error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
-    end
-
-    % Read in the metadata
-    metadata = readMetadataForPersephone(paths.General.metadataPath);
-
-    % Find the intersection of samples between the readsTable and the
-    % metadata table
-    [~,ia,ib] = intersect(metadata.ID, ...
-        relAbunFile.Properties.VariableNames(2:end)','stable');
-
-    % Perform checks
-    if isempty(ia)
-        error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
-    else
-        numRemovedSamples = size(metadata,1)-numel(ia);
-        if numRemovedSamples>0
-            disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
-        end
-        relAbunFile = relAbunFile(:,[1; 1+ib]);
-        metadata = metadata(ia,:);
-    end
-
-    % Remove all samples in the metadata that are not in the readsTable
-    disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
-    writetable(relAbunFile, paths.Mars.relAbunFilePath);
-    writetable(metadata,paths.General.metadataPath);
-end
+% %% Adjust metadata to remove any potential missing samples compared to the microbiome sample.
+% % Remove samples in the metadata that are not in the microbiome data
+% if ~isempty(paths.Mars.relAbunFilePath)
+% 
+%     % Load microbiome data
+%     relAbunFile = readtable(paths.Mars.relAbunFilePath,'VariableNamingRule','preserve');
+% 
+%     % Check if the microbiome table is empty
+%     validateattributes(relAbunFile, {'table'}, {'nonempty'}, mfilename, 'readsTable')
+% 
+%     % Check if a column named Taxon exists
+%     if ~ismember('Taxon', relAbunFile.Properties.VariableNames)
+%         error('COBRA:BadInput', 'Microbiome read table must contain an Taxon column')
+%     end
+% 
+%     % Read in the metadata
+%     metadata = readMetadataForPersephone(paths.General.metadataPath);
+% 
+%     % Find the intersection of samples between the readsTable and the
+%     % metadata table
+%     [~,ia] = intersect(metadata.ID, ...
+%         relAbunFile.Properties.VariableNames(2:end)','stable');
+%     % Perform checks
+%     if isempty(ia)
+%         error('COBRA:BadInput', 'No overlapping samples could be found between the reads table and the metadata table.')
+%     else
+%         numRemovedSamples = size(metadata,1)-numel(ia);
+%         if numRemovedSamples>0
+%             disp(strcat("> Removed ", string(numRemovedSamples), " samples in the metadata that were not present in the reads table."))
+%         end
+%         metadata = metadata(ia,:);
+%     end
+% 
+%     % Remove all samples in the metadata that are not in the readsTable
+%     disp(strcat("> ", string(size(metadata,1)), " samples were included in this study."))
+%     writetable(metadata,paths.General.metadataPath);
+% end
 
 %% Part 2:
 %%%%%%%%%%%%%%%%%%%%%%% Create microbiome models %%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -611,7 +610,7 @@
 %% Part 6: 
 %%%%%%%%%%%%%%%%%%%%%% Analysis of flux results %%%%%%%%%%%%%%%%%%%%%%%%%%%
 tic;
-if paths.stats.flagStatistics 
+if paths.stats.flagStatistics && statToolboxInstalled
     diary([resultPath filesep 'logFile_statistics.txt']) % Open diary for log file
 
     for i = 1:size(paths.stats.response,2)
@@ -644,7 +643,7 @@
 end
 totalT= 0;
 fields = fieldnames(progress);  
-for i = 1:numel(fields)
+for i = 1:numel(fields)-1
     data = progress.(fields{i});  
     totalT = totalT + data(2); 
 end

diff --git a/src/analysis/persephone/validatePersephoneInputs.m b/src/analysis/persephone/validatePersephoneInputs.m
@@ -109,9 +109,9 @@
 
 % Validate inputs for statistics
 if paths.stats.flagStatistics
-    validateattributes(paths.stats.outputPathStatistics,{'char','string'},{'nonempty'},'outputPathStatistics')
+    validateattributes(paths.stats.outputPathStatistics,{'char','string','cell'},{'nonempty'},'outputPathStatistics')
     validateattributes(paths.stats.response,{'char','string','cell'},{'nonempty'},'response')
-    validateattributes(paths.stats.confounders,{'char','string','cell'},{'nonempty'},'confounders')
+    validateattributes(paths.stats.confounders,{'char','string','cell'},{''},'confounders')
     validateattributes(paths.stats.moderationAnalysis,{'logical'},{'scalar'},'moderationAnalysis')
 end