diff --git a/pyproject.toml b/pyproject.toml index a8c9a0d..c3905ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,8 +34,7 @@ pandas = "^2.2.3" omit = [".*", "*/site-packages/*", "Snakefile"] [tool.black] -line-length = 87 -target-version = ['py311'] +line-length = 88 [build-system] requires = ["poetry-core"] diff --git a/snakemake_executor_plugin_slurm/__init__.py b/snakemake_executor_plugin_slurm/__init__.py index 2abb5ba..b534f90 100644 --- a/snakemake_executor_plugin_slurm/__init__.py +++ b/snakemake_executor_plugin_slurm/__init__.py @@ -45,6 +45,7 @@ ) from .efficiency_report import create_efficiency_report from .submit_string import get_submit_command +from .validation import validate_or_get_slurm_job_id, validate_slurm_extra from .partitions import read_partition_file, get_best_partition from .validation import ( validate_slurm_extra, @@ -539,8 +540,10 @@ def run_job(self, job: JobExecutorInterface): # To extract the job id we split by semicolon and take the first # element (this also works if no cluster name was provided) slurm_jobid = out.strip().split(";")[0] - if not slurm_jobid: - raise WorkflowError("Failed to retrieve SLURM job ID from sbatch output.") + # this slurm_jobid might be wrong: some cluster admin give convoluted + # sbatch outputs. So we need to validate it properly (and replace it + # if necessary). + slurm_jobid = validate_or_get_slurm_job_id(slurm_jobid, out) slurm_logfile = slurm_logfile.with_name( slurm_logfile.name.replace("%j", slurm_jobid) ) diff --git a/snakemake_executor_plugin_slurm/validation.py b/snakemake_executor_plugin_slurm/validation.py index 497ad6a..4931a29 100644 --- a/snakemake_executor_plugin_slurm/validation.py +++ b/snakemake_executor_plugin_slurm/validation.py @@ -8,6 +8,64 @@ from .job_status_query import get_min_job_age, is_query_tool_available +def validate_or_get_slurm_job_id(job_id, output): + """ + Validate that the SLURM job ID is a positive integer. + + Args: + job_id (str): The SLURM job ID to validate. + output (str): The full sbatch output to parse if job_id is invalid. + + Raises: + WorkflowError: If the job ID is not a positive integer or we cannot + determine a valid job ID from the given input string. + """ + # this regex just matches a positive integer + # strict validation would require to check for a JOBID with either + # the SLURM database or control daemon. This is too much overhead. + if re.match(r"^\d+$", job_id): + return job_id + else: + # Try matching a positive integer, raise an error if more than one match or + # no match found. Match standalone integers, excluding those followed by %, + # letters, or digits (units/percentages/floats). Allows format: "1234" or + # "1234; clustername" (SLURM multi-cluster format). + + # If the first attempt to validate the job fails, try parsing the sbatch output + # a bit more sophisticatedly. + # The regex below matches standalone positive integers with a word boundary + # before the number. The number must NOT be: + # - Part of a decimal number (neither before nor after the dot) + # - Followed by a percent sign with optional space (23% or 23 %) + # - Followed by units/counts with optional space: + # * Memory units: k, K, m, M, g, G, kiB, KiB, miB, MiB, giB, GiB + # * Resource counts: files, cores, hours, cpus/CPUs (case-insensitive) + # * minutes are excluded, because of the match to 'm' for Megabytes + # Units must be followed by whitespace, hyphen, period, or end of string + # Use negative lookbehind to exclude digits after a dot, and negative lookahead + # to exclude digits before a dot or followed by units/percent + matches = re.findall( + r"(? 1: + raise WorkflowError( + f"Multiple possible SLURM job IDs found in: {output}. " + "Was looking for exactly one positive integer." + ) + elif not matches: + raise WorkflowError( + f"No valid SLURM job ID found in: {output}. " + "Was looking for exactly one positive integer. " + "We tried our best to parse the sbatch output, but it appears " + "too convoluted. Please run 'sbatch' manually and report the output " + "to us. If the output is garbled, inform your cluster administrators." + ) + + def get_forbidden_slurm_options(): """ Return a dictionary of forbidden SLURM options that the executor manages. diff --git a/tests/tests.py b/tests/tests.py index 955bc8f..fb6cbaf 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -13,8 +13,10 @@ ) from snakemake_executor_plugin_slurm.utils import set_gres_string from snakemake_executor_plugin_slurm.submit_string import get_submit_command - -from snakemake_executor_plugin_slurm.validation import validate_slurm_extra +from snakemake_executor_plugin_slurm.validation import ( + validate_slurm_extra, + validate_or_get_slurm_job_id, +) from snakemake_interface_common.exceptions import WorkflowError import pandas as pd @@ -863,3 +865,193 @@ def test_multiple_forbidden_options(self, mock_job): # Should raise error for job-name (first one encountered) with pytest.raises(WorkflowError, match=r"job-name.*not allowed"): validate_slurm_extra(job) + + +class TestSlurmJobIdValidation: + """Test cases for the validate_or_get_slurm_job_id function.""" + + def test_parsable_format_simple(self): + """Test parsable format with just job ID.""" + output = "12345" + result = validate_or_get_slurm_job_id("12345", output) + assert result == "12345" + + def test_parsable_format_with_cluster(self): + """Test parsable format with cluster name (jobid;clustername).""" + output = "54321;mycluster" + result = validate_or_get_slurm_job_id("54321", output) + assert result == "54321" + + def test_convoluted_output_with_percentages(self): + """Test extraction from output containing percentages.""" + output = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. +Maecenas quis risus porttitor: 25% +pretium enim volutpat: 23.3% +Submitted batch job 88888 +some other text""" + result = validate_or_get_slurm_job_id("88888", output) + assert result == "88888" + + def test_convoluted_output_with_units(self): + """Test extraction from output containing memory/size units.""" + output = """System information: +Memory available: 256 GiB +CPU usage: 12 cores +Storage: 500 G +Job ID: 77777 +Files processed: 1500 files""" + result = validate_or_get_slurm_job_id("77777", output) + assert result == "77777" + + def test_convoluted_output_mixed(self): + """Test extraction with percentages, decimals, and units mixed.""" + output = """Cluster status report +Queue utilization: 75.5% +Memory allocated: 128 KiB per node +Disk usage: 23.3 % of quota +Allocated space: 50 G +Processing 3000 files +Your job 123456 has been submitted +""" + result = validate_or_get_slurm_job_id("123456", output) + assert result == "123456" + + def test_job_id_at_beginning(self): + """Test extraction when job ID appears at the start.""" + output = """999888 submitted successfully +Memory: 64 GiB +Nodes: 4""" + result = validate_or_get_slurm_job_id("999888", output) + assert result == "999888" + + def test_job_id_in_middle(self): + """Test extraction when job ID is in the middle of output.""" + output = """Configuration loaded: 100% +Job 444555 queued +Estimated wait time: 5.5 minutes""" + result = validate_or_get_slurm_job_id("444555", output) + assert result == "444555" + + def test_output_with_lowercase_units(self): + """Test that lowercase units are properly excluded.""" + output = """Memory: 32 m +Storage: 100 k files +Job: 333222""" + result = validate_or_get_slurm_job_id("333222", output) + assert result == "333222" + + def test_output_with_cores_and_cpus(self): + """Test that numbers followed by 'cores' or 'cpus' are excluded.""" + output = """System resources: +Allocated: 16 cores +Available CPUs: 32 +Using 8 cpus +Active cpu: 1 +Job ID: 555666""" + result = validate_or_get_slurm_job_id("555666", output) + assert result == "555666" + + def test_output_with_mixed_case_units(self): + """Test mixed case memory units (MiB, GiB, etc.).""" + output = """Allocated: 512 MiB +Reserved: 2 GiB +Cache: 128 KiB +Job ID is 111222""" + result = validate_or_get_slurm_job_id("111222", output) + assert result == "111222" + + def test_decimal_numbers_excluded(self): + """Test that decimal numbers are not matched as job IDs.""" + output = """Performance: 23.3 MB/s +Efficiency: 99.9% +Job: 666777 +Load: 1.5""" + result = validate_or_get_slurm_job_id("666777", output) + assert result == "666777" + + def test_percentage_with_space(self): + """Test percentages with space before % sign.""" + output = """Completion: 45 % +Progress: 78.5 % +Job ID: 555444""" + result = validate_or_get_slurm_job_id("555444", output) + assert result == "555444" + + def test_units_with_hyphen(self): + """Test units followed by hyphen.""" + output = """Memory: 256M-512M range +Job: 888999""" + result = validate_or_get_slurm_job_id("888999", output) + assert result == "888999" + + def test_units_with_period(self): + """Test units followed by period.""" + output = """Allocated 128G. Starting job 777888.""" + result = validate_or_get_slurm_job_id("777888", output) + assert result == "777888" + + def test_multiple_job_ids_error(self): + """Test that multiple possible job IDs raise an error.""" + output = """Previous job: 11111 +New job: 22222 +Both are active""" + with pytest.raises( + WorkflowError, match=r"Multiple possible SLURM job IDs found" + ): + validate_or_get_slurm_job_id("invalid", output) + + def test_no_valid_job_id_error(self): + """Test that output with no valid job ID raises an error.""" + output = """Error: 23.3% +Memory: 128 GiB +Status: 99.9% complete""" + with pytest.raises(WorkflowError, match=r"No valid SLURM job ID found"): + validate_or_get_slurm_job_id("invalid", output) + + def test_complex_multiline_output(self): + """Test complex realistic SLURM output.""" + output = """ +╔══════════════════════════════════════════════════════════════════════════════╗ +║ SLURM Job Submission ║ +╚══════════════════════════════════════════════════════════════════════════════╝ + +Cluster Information: + - Queue utilization: 67.8% + - Available memory: 512 GiB + - Free storage: 2.5 TiB (1500 files pending) + +Job Configuration: + - Requested memory: 64 GiB + - Requested time: 48.5 hours + - Cores: 16 + +Submitting job 202411170001 to cluster... + +Status: + - Queue position: 23 + - Estimated start: 15.3 minutes +""" + result = validate_or_get_slurm_job_id("202411170001", output) + print(result) + assert result == "202411170001" + + def test_invalid_job_id_then_extract_from_output(self): + """Test fallback extraction when initial job_id is malformed.""" + output = "Submitted batch job 12345" + # Pass invalid job_id, should extract from output + result = validate_or_get_slurm_job_id("invalid_id", output) + assert result == "12345" + + def test_job_id_with_trailing_content(self): + """Test job ID followed by various trailing content.""" + output = "Job 123456 submitted to partition main" + result = validate_or_get_slurm_job_id("123456", output) + assert result == "123456" + + def test_numbers_in_filenames_excluded(self): + """Test that numbers in 'files' counts are excluded.""" + output = """Quota 250 files +Used 100 files +Job ID: 999000""" + result = validate_or_get_slurm_job_id("999000", output) + assert result == "999000"