Skip to content

Commit

Permalink
Mark datasets with data size over 10GiB as unsupported
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Mar 8, 2024
1 parent fb95ad8 commit ec32377
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 4 deletions.
12 changes: 10 additions & 2 deletions src/cellxgene/job-generator.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { stat } from 'fs/promises';
import { Config } from '../util/config.js';
import { UnknownOrganError } from '../util/errors.js';
import { DATASET_DATA_MAX_SIZE, DEFAULT_DATASET_DATA_MAX_SIZE } from '../util/constants.js';
import { DataTooLargeError, UnknownOrganError } from '../util/errors.js';
import { IJobGenerator } from '../util/handler.js';

/** @implements {IJobGenerator} */
Expand All @@ -11,11 +13,17 @@ export class JobGenerator {

/**
 * No-op: the body is empty, so `datasets` is not read or modified and the
 * returned promise resolves to `undefined`.
 * NOTE(review): presumably present only to satisfy IJobGenerator — confirm.
 *
 * @param {object[]} datasets Datasets that jobs will be created for (unused here)
 */
async prepareJobs(datasets) {}

createJob(dataset) {
async createJob(dataset) {
if (!dataset.organ) {
throw new UnknownOrganError(`${dataset.tissue} (${dataset.tissueId})`);
}

const { size } = await stat(dataset.dataFilePath);
const maxSize = this.config.get(DATASET_DATA_MAX_SIZE, DEFAULT_DATASET_DATA_MAX_SIZE);
if (size > maxSize) {
throw new DataTooLargeError(size);
}

return {
organ: dataset.organ,
geneColumn: 'feature_name',
Expand Down
4 changes: 2 additions & 2 deletions src/generate-jobs/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { getSummaryRef } from '../util/common.js';
import { concurrentMap } from '../util/concurrent-map.js';
import { Config } from '../util/config.js';
import { ALGORITHMS, DEFAULT_MAX_CONCURRENCY, MAX_CONCURRENCY } from '../util/constants.js';
import { UnknownOrganError } from '../util/errors.js';
import { DataTooLargeError, UnknownOrganError } from '../util/errors.js';
import { fileExists } from '../util/fs.js';
import { getCrosswalkingFilePath } from '../util/paths.js';
import { createSpecs } from './spec.js';
Expand Down Expand Up @@ -58,7 +58,7 @@ async function tryGenerateJobs(dataset, crosswalks, config) {
await writeFile(filePath, specString, { encoding: 'utf8' });
}
} catch (error) {
if (error instanceof UnknownOrganError) {
if (error instanceof UnknownOrganError || error instanceof DataTooLargeError) {
ALGORITHMS.forEach((step) => ref.setNotSupported(step));
ref.comments = error.message;
} else {
Expand Down
2 changes: 2 additions & 0 deletions src/util/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export const DATASET_LIST = 'DATASET_LIST';
export const DATASET_LIST_URL = 'DATASET_LIST_URL';
export const DATASET_COLUMN_ID = 'DATASET_COLUMN_ID';
export const DATASET_MIN_CELL_COUNT = 'DATASET_MIN_CELL_COUNT';
export const DATASET_DATA_MAX_SIZE = 'DATASET_DATA_MAX_SIZE';

export const REQUIRED_ENV_VARIABLES = [DATASET, DATASETS_DIR, OUTPUT_DIR, DATA_REPO_DIR, MODELS_DIR, SRC_DIR];

Expand All @@ -38,3 +39,4 @@ export const DEFAULT_CACHE_DIR = './tmp';
export const DEFAULT_DATASET_HANDLERS = ['hubmap', 'sennet', 'gtex', 'cellxgene'];
export const DEFAULT_DATASET_LIST = 'listing.csv';
export const DEFAULT_DATASET_MIN_CELL_COUNT = 100;
export const DEFAULT_DATASET_DATA_MAX_SIZE = 10 * 1024 * 1024 * 1024; // 10GiB
28 changes: 28 additions & 0 deletions src/util/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,31 @@ export class UnknownOrganError extends BaseError {
this.code = code;
}
}

/**
 * Formats size as a human readable string using binary (IEC) units,
 * e.g. `10737418240` -> `"10.00GiB"`.
 * Adapted from: https://stackoverflow.com/a/1094933
 *
 * @param {number} size Size in bytes
 * @returns {string} The scaled value followed by its unit. Values below
 * 1024 ZiB are printed with two decimals; anything larger falls through to
 * the YiB branch, which prints three decimals.
 */
function formatSize(size) {
  const units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'];
  // Work on a local copy so the caller-visible parameter is never reassigned.
  let value = size;
  // Iterate the prefixes themselves with for...of; for...in would yield the
  // array indices ("0", "1", ...) and produce output such as "10.003B".
  for (const unit of units) {
    if (value < 1024) {
      return `${value.toFixed(2)}${unit}B`;
    }
    value /= 1024;
  }
  return `${value.toFixed(3)}YiB`;
}

/**
 * Error indicating that the data.h5ad file is too large to be supported.
 * Stops processing of the associated dataset.
 */
export class DataTooLargeError extends BaseError {
  /**
   * @param {number} size Size of the data file in bytes; it is embedded in
   * the error message in human readable form and kept on the instance.
   */
  constructor(size) {
    // Message typo fixed ("to large" -> "too large").
    super(`Data size is too large (${formatSize(size)})`);
    /** @type {number} Raw size in bytes that triggered the error */
    this.size = size;
  }
}

0 comments on commit ec32377

Please sign in to comment.